
With the help of some members here, I managed to set up code that runs in Python and evaluates a function taking two large np.arrays as input.

The vectorized version, even when run in parallel, is still very time-consuming: it is slower than a reference program written in serial Fortran by a factor of about 50.

I would like to switch to a Cython loop instead, which I could parallelize with OpenMP or MPI. In C++ the idea would look like this:

#pragma omp parallel for
for (int i = 0; i < np1; i++) {
  for (int j = 0; j < np2; j++) {  // inner loop over the second particle set
    double dist = sph(coord1_particle1[i], coord1_particle2[j],
                      coord2_particle1[i], coord2_particle2[j]);
    int bin = binning_function(dist);
    #pragma omp atomic  // concurrent increments need atomic or per-thread histograms
    hist_array[bin]++;
  }
}

Any ideas are very welcome. Here is the Python version:

import numpy as np
import multiprocessing as mp
from time import time

def dist_vec(a):
    # a holds the four coordinate arrays of two object sets:
    # a like [array1, array2, array3, array4]
    return sph(a[0], a[1], a[2], a[3])  # sph operates on coordinates

def vec_chunk(array_ab, bins):
    dist = dist_vec(array_ab)
    hist, _ = np.histogram(dist, bins=bins)
    return hist


def mp_dist(array_a, array_b, d, bins):  # d chunks AND processes per axis
    def worker(array_ab, out_q):
        """Push the partial histogram into the queue."""
        out_hist = vec_chunk(array_ab, bins)
        out_q.put(out_hist)
    # Each process gets one chunk pair and a queue for its output
    out_q = mp.Queue()
    a = np.swapaxes(array_a, 0, 1)
    b = np.swapaxes(array_b, 0, 1)
    array_size_a = len(array_a) - (len(array_a) % d)
    array_size_b = len(array_b) - (len(array_b) % d)
    a_chunk = array_size_a // d
    b_chunk = array_size_b // d
    procs = []
    # prepare coordinate arrays for multiprocessing
    array_ab = np.empty((4, a_chunk, b_chunk))
    for j in xrange(d):
        for k in xrange(d):
            array_ab[[0, 1]] = a[:, a_chunk * j:a_chunk * (j + 1), None]
            array_ab[[2, 3]] = b[:, None, b_chunk * k:b_chunk * (k + 1)]
            p = mp.Process(target=worker, args=(array_ab, out_q))
            p.start()
            procs.append(p)
    for pro in procs:
        pro.join()
    # Collect all partial histograms into a single array
    resultarray = np.zeros(len(bins) - 1)  # zeros, not empty: we accumulate into it
    for i in range(d * d):                 # one result per started process
        resultarray += out_q.get()
    return resultarray

bins = np.logspace(-3, 1, num=25)  # prepare x-axis for histogram
start_time = time()
hist_data = mp_dist(DATA, sim, 10, bins)
print 'Total Time Elapsed: ', time() - start_time

1 Answer


The following code is about 6× faster than the original. It uses the faster histogram code from http://code.google.com/p/astrolibpy/source/browse/my_utils/quick_hist.py (np.histogram is quite slow for uniform-width bins). The new code also avoids creating so many processes: it uses a multiprocessing.Pool and avoids copying large amounts of data between processes.
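For illustration, here is a minimal sketch of the idea that makes a uniform-bin histogram fast (this is not the actual quick_hist implementation, and the function name uniform_hist is made up): with equal-width bins the bin index is a single multiply-and-truncate, so the whole histogram reduces to one np.bincount call.

import numpy as np

def uniform_hist(data, lo, hi, nbins):
    # hypothetical sketch of the uniform-bin trick, not the real quick_hist code
    # equal-width bins: bin index = (x - lo) / bin_width, computed vectorized
    idx = ((data - lo) * (nbins / float(hi - lo))).astype(np.int64)
    idx = np.clip(idx, 0, nbins - 1)  # fold out-of-range values into the edge bins
    return np.bincount(idx, minlength=nbins)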

The remaining performance could be gained by rewriting the distance function in Cython, or better yet, by rewriting dist_vec() in Cython or scipy.weave (see the examples in the quick_hist code).
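As a rough illustration only, a fused Cython kernel could look something like the sketch below (untested; the module and function names are made up, and the inline expression just mirrors the sph() used in this answer). Fusing the distance computation and the binning means the n1×n2 distance matrix is never materialized:

# pair_hist.pyx -- hypothetical sketch, not tested code
# cython: boundscheck=False, wraparound=False
from libc.math cimport sqrt, log
import numpy as np

def pair_hist(double[:, :] a, double[:, :] b, double lo, double hi, int nbins):
    cdef Py_ssize_t i, j
    cdef int k
    cdef double dist, scale = nbins / (hi - lo)
    cdef long long[:] hist = np.zeros(nbins, dtype=np.int64)
    for i in range(a.shape[0]):
        for j in range(b.shape[0]):
            # same quantity as sph(): log of the Euclidean distance
            dist = log(sqrt((a[i, 0] - b[j, 0])**2 + (a[i, 1] - b[j, 1])**2))
            k = <int>((dist - lo) * scale)
            if 0 <= k < nbins:
                hist[k] += 1
    return np.asarray(hist)

An OpenMP version via cython.parallel.prange would additionally need per-thread histograms (or atomic increments) to avoid races on hist, which is why only the serial loop is sketched. The full parallel Python version of this answer follows: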

import numpy as np, multiprocessing as mp
import numexpr
from time import time
import quick_hist

def sph(a, b, c, d):
    # log of the Euclidean distance, evaluated by numexpr in one pass
    return numexpr.evaluate('log(((a - c)**2 + (b - d)**2)**.5)')

def dist_vec(a, b):
    # broadcasting: rows of a against rows of b -> full (len(a), len(b)) matrix
    return sph(a[:, 0][:, None], a[:, 1][:, None], b[:, 0][None, :], b[:, 1][None, :])

def vec_chunk(a, b, bins):
    dist = dist_vec(a, b).flatten()
    hist = quick_hist.quick_hist((dist,), [(bins[0], bins[-1])], [len(bins)])
    return hist

class si:
    # singleton used to share read-only data between processes:
    # Pool workers forked on Unix inherit these module-level globals,
    # so the big arrays are never pickled and sent over pipes
    a = None
    b = None
    step = None
    bins = None

def func(l1):
    return vec_chunk(si.a[l1:l1 + si.step, :], si.b, si.bins)

def mp_dist(array_a, array_b, d, bins):  # d = chunk size along array_a
    nproc = 8  # number of worker processes
    si.a = array_a
    si.b = array_b
    si.step = d
    si.bins = bins
    nx = array_a.shape[0]
    lefts = np.arange(0, nx, d)  # left edges of the chunks
    pool = mp.Pool(nproc)
    results = pool.map(func, lefts)
    results = np.array(results).sum(axis=0)
    pool.close()
    pool.join()
    return results

if __name__=='__main__':
    bins = np.logspace(-3,1, num=25) #prepare x-axis for histogram
    start_time = time()
    n1 = 10000
    n2 = 10000
    DATA = np.random.uniform(size=(n1, 2))
    sim = np.random.uniform(size=(n2, 2))
    chunksize = 10
    hist_data = mp_dist(DATA, sim, chunksize, bins)

    print 'Total Time Elapsed: ', time() - start_time
answered 2013-04-03 19:07