我必须从 sklearn KDTree 中查询大量向量,这是搜索器类的路径。我正在尝试使用 python 多处理并行查询它们,但是并行代码与单个版本花费几乎相同(或更多)的时间。
import time, numpy as np
from sklearn.neighbors import KDTree
from multiprocessing import Pool
def glob_query(arg, **kwarg):
return Searcher.query(*arg, **kwarg)
class Searcher:
def __init__(self, N, D):
self.kdt = KDTree(np.random.rand(N,D), leaf_size=30, metric="euclidean")
def query(self, X):
return self.kdt.query(X, k=5, return_distance=False)
def query_sin(self, X):
return [self.query(x) for x in X]
def query_par(self, X):
p = Pool(4)
return p.map(glob_query, zip([self]*len(X), X))
if __name__=="__main__":
N = 1000000 # Number of points to be indexed
D = 50 # Dimensions
searcher = Searcher(N, D)
E = 100 # Number of points to be searched
points = np.random.rand(E, D)
# Works fine
start = time.time()
searcher.query_sin(points)
print("Time taken - %f"%(time.time()-start))
# Slower than single core
start = time.time()
print searcher.query_par(points)
print("Time taken - %f"%(time.time()-start))
Time taken - 28.591089
Time taken - 36.920716
我想知道
- 如果我的 kd-tree 被复制到每个工作线程
- 是否有另一种并行搜索的方法(使用 pathos?)