c - 如何以有效的方式并行化这个三重循环？

Question

我正在尝试并行化一个函数，该函数将三个数组（x、y 和 prb）和一个标量作为输入，并输出三个数组（P1、Pt1 和 Px）。

原始 c 代码在这里（异常值和 E 无关紧要）：

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define max(A, B)   ((A) > (B) ? (A) : (B))
#define min(A, B)   ((A) < (B) ? (A) : (B))

void cpd_comp(
        double* x,
        double* y, 
        double* prb,
        double* sigma2,
        double* outlier,
        double* P1,
        double* Pt1,
        double* Px,
        double* E,
        int N,
        int M,
        int D
        )

{
  int       n, m, d;
  double    ksig, diff, razn, outlier_tmp, sp;
  double    *P, *temp_x;

  P = (double*) calloc(M, sizeof(double));
  temp_x = (double*) calloc(D, sizeof(double));

  ksig = -2.0 * *sigma2;


  for (n=0; n < N; n++) {

      sp=0;
      for (m=0; m < M; m++) {
          razn=0;
          for (d=0; d < D; d++) {
             diff=*(x+n+d*N)-*(y+m+d*M);  diff=diff*diff;
             razn+=diff;
          }

          *(P+m)=exp(razn/ksig) ;
          sp+=*(P+m);
      }


      *(Pt1+n)=*(prb+n);
      for (d=0; d < D; d++) {
       *(temp_x+d)=*(x+n+d*N)/ sp;
      }

      for (m=0; m < M; m++) {
          *(P1+m)+=((*(P+m)/ sp) **(prb+n));

          for (d=0; d < D; d++) {
          *(Px+m+d*M)+= (*(temp_x+d)**(P+m)**(prb+n));
          }

      }

   *E +=  -log(sp);     
  }
  *E +=D*N*log(*sigma2)/2;


  free((void*)P);
  free((void*)temp_x);

  return;
}

这是我并行化它的尝试：

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>

/*headers*/
void cpd_comp(
    float * x,        //Points to register      [N*D]
    float * y,        //Points to be registered [M*D]
    float * prb,      //Vector of probabilities [N]
    float * sigma2,   //Square of sigma
    float ** P1,       //P1,  output, [M]
    float ** Pt1,      //Pt1, output, [N]
    float ** Px,       //Px,  output, [M*3]
    int N,            //Number of points, i.e. rows, in x
    int M             //Number of points, i.e. rows, in 
    );

__global__ void d_computeP(
    float * P,
    float * P1,
    float * Px,
    float * ProbabilityMatrix,
    float * x,
    float * y,
    float * prb,
    float ksig,
    const int N,
    const int M);

__global__ void d_sumP(
    float * sp,
    float * P1timessp,
    float * Pxtimessp,
    float * P1,
    float * Px,
    const int N,
    const int M);

/*implementations*/

void cpd_comp(
    float * x,        //Points to register      [N*D]
    float * y,        //Points to be registered [M*D]
    float * prb,      //Vector of probabilities [N]
    float * sigma2,   //Scalar
    float ** P1,       //P1,  output, [M]
    float ** Pt1,      //Pt1, output, [N]
    float ** Px,       //Px,  output, [M*3]
    int N,            //Number of points, i.e. rows, in x
    int M             //Number of points, i.e. rows, in y
    ){
    //X is generatedPointPos
    //Y is points

    float
        *P,
        *P1timessp,
        *Pxtimessp,
        ksig = -2.0 * (*sigma2),
        *h_sumofP = new float[N], //sum of P, on host
        *d_sumofP;                //sum of P, on device

    cudaMalloc((void**)&P,        sizeof(float)*M*N);
    cudaMalloc((void**)&P1timessp,sizeof(float)*M*N);
    cudaMalloc((void**)&Pxtimessp,sizeof(float)*M*N*3);
    cudaMalloc((void**)&d_sumofP, sizeof(float)*N);

    cudaMalloc((void**)P1,        sizeof(float)*M);
    cudaMalloc((void**)Px,        sizeof(float)*M*3);
    cudaMalloc((void**)Pt1,       sizeof(float)*N);

    d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M);

    for(int n=0; n<N; n++){
        thrust::device_ptr<float>dev_ptr(P);
        h_sumofP[n] = thrust::reduce(dev_ptr+M*n,dev_ptr+M*(n+1),0.0f,thrust::plus<float>());
    }

    cudaMemcpy(d_sumofP,h_sumofP,sizeof(float)*N,cudaMemcpyHostToDevice);

    d_sumP<<<M/1024+1,M>1024?1024:M>>>(d_sumofP,P1timessp,Pxtimessp,*P1,*Px,N,M);

    cudaMemcpy(*Pt1,prb,sizeof(float)*N,cudaMemcpyDeviceToDevice);

    cudaFree(P);
    cudaFree(P1timessp);
    cudaFree(Pxtimessp);
    cudaFree(d_sumofP);
    delete[]h_sumofP;
}

/*kernels*/

__global__ void d_computeP(
    float * P,
    float * P1,
    float * Px,
    float * ProbabilityMatrix,
    float * x,
    float * y,
    float * prb,
    float ksig,
    const int N,
    const int M){
    //thread configuration: <<<dim3(N,M/1024+1),1024>>>
    int m = threadIdx.x+blockIdx.y*blockDim.x;
    int n = blockIdx.x;
    if(m>=M || n>=N) return;

    float 
        x1 = x[3*n],
        x2 = x[3*n+1],
        x3 = x[3*n+2],
        diff1 = x1 - y[3*m],
        diff2 = x2 - y[3*m+1],
        diff3 = x3 - y[3*m+2],
        razn = diff1*diff1+diff2*diff2+diff3*diff3,

        Pm = __expf(razn/ksig), //fast exponentiation
        prbn = prb[n];

    P[M*n+m] = Pm; 

    __syncthreads();

    P1[N*m+n] = Pm*prbn;
    Px[3*(N*m+n)+0] = x1*Pm*prbn;
    Px[3*(N*m+n)+1] = x2*Pm*prbn;
    Px[3*(N*m+n)+2] = x3*Pm*prbn;
}

__global__ void d_sumP(
    float * sp,
    float * P1timessp,
    float * Pxtimessp,
    float * P1,
    float * Px,
    const int N,
    const int M){
    //computes P1 and Px
    //thread configuration: <<<M/1024+1,1024>>>
    int m = threadIdx.x+blockIdx.x*blockDim.x;
    if(m>=M) return;
    float 
        P1m = 0,
        Pxm1 = 0,
        Pxm2 = 0,
        Pxm3 = 0;
    for(int n=0; n<N; n++){
        float spn = 1/sp[n];
        P1m += P1timessp[N*m+n]*spn;
        Pxm1 += Pxtimessp[3*(N*m+n)+0]*spn;
        Pxm2 += Pxtimessp[3*(N*m+n)+1]*spn;
        Pxm3 += Pxtimessp[3*(N*m+n)+2]*spn;
    }

    P1[m] = P1m;
    Px[3*m+0] = Pxm1;
    Px[3*m+1] = Pxm2;
    Px[3*m+2] = Pxm3;

}

然而，令我恐惧的是，它的运行速度比原始版本慢得多。如何让它运行得更快？请彻底解释一下，因为我对 CUDA 和并行编程非常陌生，并且没有算法经验。

请注意，c 版本具有列优先排序，而 CUDA 版本具有行优先排序。我做了几次测试以确保结果是正确的。它非常慢并且占用大量内存。

任何帮助是极大的赞赏！

编辑：更多信息：N 和 M 大约为几千（例如，300-3000），而 D 始终为 3。CUDA 版本期望数组是设备内存，但以 h_ 为前缀的变量除外。

score 1 · Accepted Answer

在尝试任何特定于 CUDA 的优化之前，请分析您的代码以查看时间花在了哪里。

尝试安排您的数组读取/写入，以便每个 CUDA 线程使用跨步访问模式。例如，目前您有

int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;

diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],

所以线程 1 将从y[0],y[1],y[2]etc 读取。相反，重新排列您的数据，以便线程 1 从 etc 读取y[0],y[M],y[2*M]，线程 2 从y[1],y[M+1],y[2*M+1]etc 读取。对于其他数组，您应该遵循此访问模式。

此外，您可能需要考虑是否可以避免使用__syncthreads(). 我不太明白为什么在这个算法中它是必要的，可能值得删除它以查看它是否能提高性能（即使它产生不正确的结果）。

score 0 · Accepted Answer

获得良好 CUDA 性能的关键几乎总是尽可能接近最佳内存访问。您的内存访问模式看起来与矩阵乘法非常相似。我将从一个好的 CUDA 矩阵乘法实现开始，确保理解它为什么以这种方式实现，然后根据您的需要进行修改。

c - 如何以有效的方式并行化这个三重循环？

2 回答 2

Related

Reference