cuda - 在 CUDA 中将 __syncthreads() 与全局内存一起使用

Question

我正在尝试实现一个异步 PSO。我这样做的方法如下：

// PSO update kernel: each thread owns one coordinate (element `thread`) of the
// flattened pos/vel/pbest arrays and repeats the velocity/position update
// 10000 times inside a single launch.
//
// Parameters:
//   pos    - flattened particle positions; element `thread` is read and written here.
//   pbest  - flattened best-known positions, same layout as pos.
//   vpbest - one best objective value per particle (indexed by `particle`).
//   vel    - flattened velocities, same layout as pos.
//   gbest  - not referenced anywhere in this kernel body.
//
// DIMPAR, NDIM, ring(), uniform() and rastrigin() are defined outside this snippet.
__global__ void particle(double *pos, double *pbest, double *vpbest, double *vel, double *gbest){

    // Flat global thread index; one thread per element of pos/vel/pbest.
    int thread = threadIdx.x + blockDim.x * blockIdx.x;
    int particle, i = 0;
    double tpbest;

    double l, r;
    int index, best, j;

    // Only threads that map onto a real coordinate enter the loop. Everything
    // below, including the barrier, is therefore skipped by threads >= DIMPAR.
    if(thread < DIMPAR){
      // Particle that owns this coordinate (NDIM consecutive elements per particle).
      particle = thread / NDIM;
        do{
            // presumably picks the best neighbour in a ring topology from vpbest — TODO confirm against ring()
            best    = ring(vpbest, &particle);

            // Same coordinate (thread % NDIM) inside the chosen neighbour's slice.
            index = (best * NDIM) + (thread % NDIM);

            // Pull towards this particle's own best (l) and towards the neighbour's best (r).
            // uniform(thread) presumably returns a random factor — TODO confirm.
            l = (double) 2.05 * (double) uniform(thread) * ( pbest[thread] -   pos[thread] );
            r = (double) 2.05 * (double) uniform(thread) * ( pbest[index]  -   pos[thread] );

            vel[thread] = vel[thread] + l + r;

                pos[thread] =  pos[thread] + vel[thread];

                // Intended to wait until every thread has written its pos element to global memory.
                // NOTE(review): this barrier is inside the `thread < DIMPAR` branch, so threads
                // that failed the guard never reach it.
                __syncthreads(); // I am trying wait all threads write in global memory

            // Only the first thread of each particle evaluates the objective and replaces the best vector.
            if( (thread % NDIM) == 0 ){ //only one thread replace the vector
                // presumably the Rastrigin objective over this particle's NDIM coordinates — TODO confirm
                tpbest = rastrigin(pos, particle * NDIM, NDIM);
                if(tpbest < vpbest[particle]){
                    vpbest[particle] = tpbest;
                    // Copy the whole position slice of this particle into pbest.
                    for(j = 0 ; j < NDIM; j++){
                        pbest[(particle * NDIM) + j] = pos[(particle * NDIM) + j];
                    }

                }
            }

            // NOTE(review): there is no barrier between the pbest/vpbest update above and the
            // next iteration, where other threads read pbest/vpbest and overwrite pos.
            i++;
        }while(i < 10000);
    }
}

调用方式：

// Launches the kernel as a single block of 512 threads.
particle<<<1,512>>>(d_pos, d_pbest, d_vpbest, d_velo, d_gbest);

有时同步会出现问题……pos[thread] 中的某些值会发散。《CUDA C 编程指南》B.6 节中写道：

等待直到线程块中的所有线程都达到这一点，并且这些线程在 __syncthreads() 之前进行的所有全局和共享内存访问对块中的所有线程都是可见的。

pos向量是这样的：

p0 = [0,1,2] //粒子 1

p1 = [3,4,5] //粒子 2

p2 = [6,7,8] //粒子 3

pos = [0,1,2,3,4,5,6,7,8] //pos向量（由 p0、p1、p2 依次拼接而成），DIMPAR = 9；NPAR = 3；NDIM = 3

当我使用 NDIM >= 30 时，结果就会发散。

如何使用全局内存确保同步？

score 1 · Accepted Answer

您的 __syncthreads() 位于 if 语句内部。请注意：如果块大小大于 DIMPAR，部分线程永远不会到达该屏障，程序会挂起。要让 __syncthreads() 正常工作，块内的所有线程都必须能够到达它。

修改代码的一种方法如下（我不知道您的代码的目的，因此可能有更好的方法）：

// Body of the `particle` kernel, replacing the original `if(thread < DIMPAR){ ... }`.
// The loop now runs for every thread of the block and only the memory work is
// guarded, so both barriers are reached by all threads on every iteration.
// __syncthreads() is block-scoped: this relies on the single-block launch
// (<<<1,512>>>), i.e. DIMPAR must not exceed the block size.
particle = thread / NDIM;
    do{
        // Phase 1: each in-range thread updates its own velocity/position element.
        if(thread < DIMPAR){
            best    = ring(vpbest, &particle);

            index = (best * NDIM) + (thread % NDIM);

            l = (double) 2.05 * (double) uniform(thread) * ( pbest[thread] -   pos[thread] );
            r = (double) 2.05 * (double) uniform(thread) * ( pbest[index]  -   pos[thread] );

            vel[thread] = vel[thread] + l + r;

            pos[thread] =  pos[thread] + vel[thread];
        }

        // Barrier 1: every pos[] write of this iteration is visible before any
        // particle's full position slice is read below.
        __syncthreads();

        // Phase 2: the first thread of each particle evaluates the objective and,
        // on improvement, replaces that particle's best value and best position.
        if(thread < DIMPAR && (thread % NDIM) == 0){
            tpbest = rastrigin(pos, particle * NDIM, NDIM);
            if(tpbest < vpbest[particle]){
                vpbest[particle] = tpbest;
                for(j = 0 ; j < NDIM; j++){
                    pbest[(particle * NDIM) + j] = pos[(particle * NDIM) + j];
                }
            }
        }

        // Barrier 2: without it, sibling threads start the next iteration while
        // phase 2 is still reading pos[] and writing pbest[]/vpbest[]; they would
        // overwrite pos[] under rastrigin() and read half-updated best values.
        __syncthreads();

        i++;
    }while(i < 10000);

现在所有线程都可以到达同步点。我在其余代码中注意到的另一个问题是：每 NDIM 个线程中只有一个线程在执行 for 循环，其余线程处于空闲状态。相反，您可以将 tpbest 设为块内线程共享的变量（或数组）。然后，在再次同步线程之后，可以把这 NDIM 次复制工作分配给原本空闲的线程，让它们一起把结果写入全局内存，而不是使用 for 循环。这样访问速度更快，而且是合并（coalesced）访问。

cuda - 在 CUDA 中将 __syncthreads() 与全局内存一起使用

1 回答 1

Related

Reference