我正在尝试并行化一个函数,该函数将三个数组(x、y 和 prb)和一个标量作为输入,并输出三个数组(P1、Pt1 和 Px)。
原始 c 代码在这里(异常值和 E 无关紧要):
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define max(A, B) ((A) > (B) ? (A) : (B))
#define min(A, B) ((A) < (B) ? (A) : (B))
void cpd_comp(
double* x,
double* y,
double* prb,
double* sigma2,
double* outlier,
double* P1,
double* Pt1,
double* Px,
double* E,
int N,
int M,
int D
)
{
int n, m, d;
double ksig, diff, razn, outlier_tmp, sp;
double *P, *temp_x;
P = (double*) calloc(M, sizeof(double));
temp_x = (double*) calloc(D, sizeof(double));
ksig = -2.0 * *sigma2;
for (n=0; n < N; n++) {
sp=0;
for (m=0; m < M; m++) {
razn=0;
for (d=0; d < D; d++) {
diff=*(x+n+d*N)-*(y+m+d*M); diff=diff*diff;
razn+=diff;
}
*(P+m)=exp(razn/ksig) ;
sp+=*(P+m);
}
*(Pt1+n)=*(prb+n);
for (d=0; d < D; d++) {
*(temp_x+d)=*(x+n+d*N)/ sp;
}
for (m=0; m < M; m++) {
*(P1+m)+=((*(P+m)/ sp) **(prb+n));
for (d=0; d < D; d++) {
*(Px+m+d*M)+= (*(temp_x+d)**(P+m)**(prb+n));
}
}
*E += -log(sp);
}
*E +=D*N*log(*sigma2)/2;
free((void*)P);
free((void*)temp_x);
return;
}
这是我并行化它的尝试:
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
/*headers*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Square of sigma
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in
);
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M);
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M);
/*implementations*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Scalar
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in y
){
//X is generatedPointPos
//Y is points
float
*P,
*P1timessp,
*Pxtimessp,
ksig = -2.0 * (*sigma2),
*h_sumofP = new float[N], //sum of P, on host
*d_sumofP; //sum of P, on device
cudaMalloc((void**)&P, sizeof(float)*M*N);
cudaMalloc((void**)&P1timessp,sizeof(float)*M*N);
cudaMalloc((void**)&Pxtimessp,sizeof(float)*M*N*3);
cudaMalloc((void**)&d_sumofP, sizeof(float)*N);
cudaMalloc((void**)P1, sizeof(float)*M);
cudaMalloc((void**)Px, sizeof(float)*M*3);
cudaMalloc((void**)Pt1, sizeof(float)*N);
d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M);
for(int n=0; n<N; n++){
thrust::device_ptr<float>dev_ptr(P);
h_sumofP[n] = thrust::reduce(dev_ptr+M*n,dev_ptr+M*(n+1),0.0f,thrust::plus<float>());
}
cudaMemcpy(d_sumofP,h_sumofP,sizeof(float)*N,cudaMemcpyHostToDevice);
d_sumP<<<M/1024+1,M>1024?1024:M>>>(d_sumofP,P1timessp,Pxtimessp,*P1,*Px,N,M);
cudaMemcpy(*Pt1,prb,sizeof(float)*N,cudaMemcpyDeviceToDevice);
cudaFree(P);
cudaFree(P1timessp);
cudaFree(Pxtimessp);
cudaFree(d_sumofP);
delete[]h_sumofP;
}
/*kernels*/
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M){
//thread configuration: <<<dim3(N,M/1024+1),1024>>>
int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;
float
x1 = x[3*n],
x2 = x[3*n+1],
x3 = x[3*n+2],
diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],
razn = diff1*diff1+diff2*diff2+diff3*diff3,
Pm = __expf(razn/ksig), //fast exponentiation
prbn = prb[n];
P[M*n+m] = Pm;
__syncthreads();
P1[N*m+n] = Pm*prbn;
Px[3*(N*m+n)+0] = x1*Pm*prbn;
Px[3*(N*m+n)+1] = x2*Pm*prbn;
Px[3*(N*m+n)+2] = x3*Pm*prbn;
}
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M){
//computes P1 and Px
//thread configuration: <<<M/1024+1,1024>>>
int m = threadIdx.x+blockIdx.x*blockDim.x;
if(m>=M) return;
float
P1m = 0,
Pxm1 = 0,
Pxm2 = 0,
Pxm3 = 0;
for(int n=0; n<N; n++){
float spn = 1/sp[n];
P1m += P1timessp[N*m+n]*spn;
Pxm1 += Pxtimessp[3*(N*m+n)+0]*spn;
Pxm2 += Pxtimessp[3*(N*m+n)+1]*spn;
Pxm3 += Pxtimessp[3*(N*m+n)+2]*spn;
}
P1[m] = P1m;
Px[3*m+0] = Pxm1;
Px[3*m+1] = Pxm2;
Px[3*m+2] = Pxm3;
}
然而,令我恐惧的是,它的运行速度比原始版本慢得多。如何让它运行得更快?请彻底解释一下,因为我对 CUDA 和并行编程非常陌生,并且没有算法经验。
请注意,c 版本具有列优先排序,而 CUDA 版本具有行优先排序。我做了几次测试以确保结果是正确的。它非常慢并且占用大量内存。
任何帮助是极大的赞赏!
编辑:更多信息:N 和 M 大约为几千(例如,300-3000),而 D 始终为 3。CUDA 版本期望数组是设备内存,但以 h_ 为前缀的变量除外。