1

我正在为 2D/3D 非线性扩散编写我的第一个 CUDA 代码。2D 案例工作正常,但我在 3D 案例中苦苦挣扎。基本上我在有限差分计算阶段得到了零,令人惊讶的是,'deltaN'(请参见下面的代码)给出了正确的答案,但其他的却不起作用(答案为零)。我正在尝试处理 256x256x256 卷。请问有什么建议吗?谢谢!

 #define BLKXSIZE 8
 #define BLKYSIZE 8
 #define BLKZSIZE 8
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )

void AnisotropDiff_GPU(double* A, double* B, int N, int M, int Z, double sigma, int iter, double tau, int type)
{ 
// Nonlinear Diffusion in 3D 
double *Ad;     

dim3 dimBlock(BLKXSIZE, BLKYSIZE, BLKZSIZE);           
dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE), idivup(Z,BLKYSIZE));    

cudaMalloc((void**)&Ad,N*M*Z*sizeof(double));  
cudaMemcpy(Ad,A,N*M*Z*sizeof(double),cudaMemcpyHostToDevice);  

int n = 1;
while (n <= iter) {    
anis_diff3D<<<dimGrid,dimBlock>>>(Ad, N, M, Z, sigma, iter, tau, type);  
n++;}
cudaMemcpy(B,Ad,N*M*Z*sizeof(double),cudaMemcpyDeviceToHost);
cudaFree(Ad);
}

这是计算有限差分的部分

 __global__ void anis_diff3D(double* A, int N, int M, int Z, double sigma, int iter, double tau, int type)
{

 int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
 int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
 int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

 if ( (xIndex < N) && (yIndex < M) && (zIndex < Z) )
 {
    int index_out = xIndex + M*yIndex + N*M*zIndex;

    double deltaN=0, deltaS=0, deltaW=0, deltaE=0, deltaU=0, deltaD=0;
    double cN, cS, cW, cE, cU, cD;

    int indexN = (xIndex-1) + M*yIndex + N*M*zIndex;
    int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;
    int indexW = xIndex + M*(yIndex-1) + N*M*zIndex;
    int indexE = xIndex + M*(yIndex+1) + N*M*zIndex;
    int indexU = xIndex + M*yIndex + N*M*(zIndex-1);
    int indexD = xIndex + M*yIndex + N*M*(zIndex+1);


    if (xIndex>1)
        deltaN = A[indexN]-A[index_out];
    if (xIndex<N)
        deltaS = A[indexS]-A[index_out];    
    if (yIndex>1)
        deltaW = A[indexW]-A[index_out];    
    if (yIndex<M)
        deltaE = A[indexE]-A[index_out];
    if (zIndex>1)
        deltaU = A[indexU]-A[index_out];    
    if (zIndex<Z)
        deltaD = A[indexD]-A[index_out];

   A[index_out] = deltaN ; // works for deltaN but not for deltaS, deltaW... . 

 }

非常感谢您的帮助!

4

1 回答 1

2

您尝试在内核中计算的某些值的索引越界。

如果您使用最后一行内核编译代码,如下所示:

A[index_out] = deltaS ;

然后运行它cuda-memcheck,cuda-memcheck 将报告越界访问:

========= Invalid __global__ read of size 8
=========     at 0x000000b0 in anis_diff3D(double*, int, int, int, double, int, double, int)
=========     by thread (7,7,7) in block (31,31,31)
=========     Address 0x408100000 is out of bounds

那么发生了什么?让我们看看你的指数计算:

int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;

对于网格中的最后一个线程(块 (31, 31, 31) 中的线程 (7,7,7)),此索引计算索引超出A了此行中内存数组的末尾:

    deltaS = A[indexS]-A[index_out];

您必须处理这些边界条件才能使事情正常运行。

当我们这样做的时候,如果你做了错误检查,你的内核就会抛出一个错误。请注意,根据您选择在内核末尾存储的值,编译器可能会优化其他计算,从而使内核看起来运行正确(例如,如果您存储 deltaN 而不是 deltaS)。这是一个带有错误检查的代码示例:

#include <stdio.h>
#include <stdlib.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

 #define BLKXSIZE 8
 #define BLKYSIZE 8
 #define BLKZSIZE 8
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
 #define sizeX 256
 #define sizeY 256
 #define sizeZ 256
 #define sizeT (sizeX*sizeY*sizeZ)


 __global__ void anis_diff3D(double* A, int N, int M, int Z, double sigma, int iter, double tau, int type)
{

 int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
 int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
 int zIndex = blockDim.z * blockIdx.z + threadIdx.z;

 if ( (xIndex < N) && (yIndex < M) && (zIndex < Z) )
 {
    int index_out = xIndex + M*yIndex + N*M*zIndex;

    double deltaN=0, deltaS=0, deltaW=0, deltaE=0, deltaU=0, deltaD=0;
    double cN, cS, cW, cE, cU, cD;

    int indexN = (xIndex-1) + M*yIndex + N*M*zIndex;
    if (indexN > ((N*M*Z)-1)) indexN = (N*M*Z) -1;
    if (indexN < 0) indexN = 0;
    int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;
    if (indexS > ((N*M*Z)-1)) indexS = (N*M*Z) -1;
    if (indexS < 0) indexS = 0;
    int indexW = xIndex + M*(yIndex-1) + N*M*zIndex;
    if (indexW > ((N*M*Z)-1)) indexW = (N*M*Z) -1;
    if (indexW < 0) indexW = 0;
    int indexE = xIndex + M*(yIndex+1) + N*M*zIndex;
    if (indexE > ((N*M*Z)-1)) indexE = (N*M*Z) -1;
    if (indexE < 0) indexE = 0;
    int indexU = xIndex + M*yIndex + N*M*(zIndex-1);
    if (indexU > ((N*M*Z)-1)) indexU = (N*M*Z) -1;
    if (indexU < 0) indexU = 0;
    int indexD = xIndex + M*yIndex + N*M*(zIndex+1);
    if (indexD > ((N*M*Z)-1)) indexD = (N*M*Z) -1;
    if (indexD < 0) indexD = 0;    

    if (xIndex>1)
        deltaN = A[indexN]-A[index_out];
    if (xIndex<N)
        deltaS = A[indexS]-A[index_out];
    if (yIndex>1)
        deltaW = A[indexW]-A[index_out];
    if (yIndex<M)
        deltaE = A[indexE]-A[index_out];
    if (zIndex>1)
        deltaU = A[indexU]-A[index_out];
    if (zIndex<Z)
        deltaD = A[indexD]-A[index_out];

   A[index_out] = deltaS; // works for deltaN but not for deltaS, deltaW... .

 }
}
void AnisotropDiff_GPU(double* A, double* B, int N, int M, int Z, double sigma, int iter, double tau, int type)
{
  // Nonlinear Diffusion in 3D
  double *Ad;

  dim3 dimBlock(BLKXSIZE, BLKYSIZE, BLKZSIZE);
  dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE), idivup(Z,BLKYSIZE));

  cudaMalloc((void**)&Ad,N*M*Z*sizeof(double));
  cudaCheckErrors("cm1");
  cudaMemcpy(Ad,A,N*M*Z*sizeof(double),cudaMemcpyHostToDevice);
  cudaCheckErrors("cc1");
  int n = 1;
  while (n <= iter) {
    anis_diff3D<<<dimGrid,dimBlock>>>(Ad, N, M, Z, sigma, iter, tau, type);
    n++;
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel");
  }
  cudaMemcpy(B,Ad,N*M*Z*sizeof(double),cudaMemcpyDeviceToHost);
  cudaCheckErrors("cc2");
  cudaFree(Ad);
}

int main(){

  double *A;
  A = (double *)malloc(sizeT * sizeof(double));
  if (A == 0) {printf("malloc fail\n"); return 1;}

  for (int i=0; i< sizeT; i++)
    A[i] = (double)(rand()/(double)RAND_MAX);

  printf("data:\n");
  for (int i = 0; i < 8; i++)
    printf("A[%d] = %f\n", i, A[i]);

  AnisotropDiff_GPU(A, A, sizeX, sizeY, sizeZ, 0.5f, 3, 0.5, 3);
  printf("results:\n");
  for (int i = 0; i < 8; i++)
    printf("A[%d] = %f\n", i, A[i]);

  return 0;
}

编辑 我已经编辑了上面的代码以将索引计算限制在定义数组的边界。这应该可以防止越界访问。从算法的角度来看,我不知道这是否明智。

于 2013-03-02T17:00:49.973 回答