一个多小时以来,我一直在摆弄 Python 的multiprocessing
功能,尝试使用multiprocessing.Process
and并行化一个相当复杂的图形遍历函数multiprocessing.Manager
:
import networkx as nx
import csv
import time
from operator import itemgetter
import os
import multiprocessing as mp
cutoff = 1
exclusionlist = ["cpd:C00024"]
DG = nx.read_gml("KeggComplete.gml", relabel=True)
for exclusion in exclusionlist:
DG.remove_node(exclusion)
# checks if 'memorizedPaths exists, and if not, creates it
fn = os.path.join(os.path.dirname(__file__),
'memorizedPaths' + str(cutoff+1))
if not os.path.exists(fn):
os.makedirs(fn)
manager = mp.Manager()
memorizedPaths = manager.dict()
filepaths = manager.dict()
degreelist = sorted(DG.degree_iter(),
key=itemgetter(1),
reverse=True)
def _all_simple_paths_graph(item, DG, cutoff, memorizedPaths, filepaths):
source = item[0]
uniqueTreePaths = []
if cutoff < 1:
return
visited = [source]
stack = [iter(DG[source])]
while stack:
children = stack[-1]
child = next(children, None)
if child is None:
stack.pop()
visited.pop()
elif child in memorizedPaths:
for path in memorizedPaths[child]:
newPath = (tuple(visited) + tuple(path))
if (len(newPath) <= cutoff) and
(len(set(visited) & set(path)) == 0):
uniqueTreePaths.append(newPath)
continue
elif len(visited) < cutoff:
if child not in visited:
visited.append(child)
stack.append(iter(DG[child]))
if visited not in uniqueTreePaths:
uniqueTreePaths.append(tuple(visited))
else: # len(visited) == cutoff:
if (visited not in uniqueTreePaths) and
(child not in visited):
uniqueTreePaths.append(tuple(visited + [child]))
stack.pop()
visited.pop()
# writes the absolute path of the node path file into the hash table
filepaths[source] = str(fn) + "/" + str(source) + "path.txt"
with open (filepaths[source], "wb") as csvfile2:
writer = csv.writer(csvfile2, delimiter=" ", quotechar="|")
for path in uniqueTreePaths:
writer.writerow(path)
memorizedPaths[source] = uniqueTreePaths
############################################################################
if __name__ == '__main__':
start = time.clock()
for item in degreelist:
test = mp.Process(target=_all_simple_paths_graph,
args=(DG, cutoff, item, memorizedPaths, filepaths))
test.start()
test.join()
end = time.clock()
print (end-start)
目前 - 虽然运气和魔法 - 它有效(有点)。我的问题是我只使用了 24 个内核中的 12 个。
有人可以解释为什么会这样吗?也许我的代码不是最好的多处理解决方案,或者它是我在 Ubuntu 13.04 x64 上运行的 Intel Xeon CPU E5-2640 @ 2.50GHz x18架构的一个特性?
编辑:
我设法得到:
p = mp.Pool()
for item in degreelist:
p.apply_async(_all_simple_paths_graph,
args=(DG, cutoff, item, memorizedPaths, filepaths))
p.close()
p.join()
但是,工作非常缓慢!所以我假设我在工作中使用了错误的功能。希望它有助于澄清我想要完成的事情!
EDIT2:.map
尝试:
partialfunc = partial(_all_simple_paths_graph,
DG=DG,
cutoff=cutoff,
memorizedPaths=memorizedPaths,
filepaths=filepaths)
p = mp.Pool()
for item in processList:
processVar = p.map(partialfunc, xrange(len(processList)))
p.close()
p.join()
工作,比单核慢。是时候优化了!