我正在为 2D/3D 非线性扩散编写我的第一个 CUDA 代码。2D 案例工作正常,但我在 3D 案例中苦苦挣扎。基本上我在有限差分计算阶段得到了零,令人惊讶的是,'deltaN'(请参见下面的代码)给出了正确的答案,但其他的却不起作用(答案为零)。我正在尝试处理 256x256x256 卷。请问有什么建议吗?谢谢!
#define BLKXSIZE 8
#define BLKYSIZE 8
#define BLKZSIZE 8
#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
void AnisotropDiff_GPU(double* A, double* B, int N, int M, int Z, double sigma, int iter, double tau, int type)
{
// Nonlinear Diffusion in 3D
double *Ad;
dim3 dimBlock(BLKXSIZE, BLKYSIZE, BLKZSIZE);
dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE), idivup(Z,BLKYSIZE));
cudaMalloc((void**)&Ad,N*M*Z*sizeof(double));
cudaMemcpy(Ad,A,N*M*Z*sizeof(double),cudaMemcpyHostToDevice);
int n = 1;
while (n <= iter) {
anis_diff3D<<<dimGrid,dimBlock>>>(Ad, N, M, Z, sigma, iter, tau, type);
n++;}
cudaMemcpy(B,Ad,N*M*Z*sizeof(double),cudaMemcpyDeviceToHost);
cudaFree(Ad);
}
这是计算有限差分的部分
__global__ void anis_diff3D(double* A, int N, int M, int Z, double sigma, int iter, double tau, int type)
{
int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
if ( (xIndex < N) && (yIndex < M) && (zIndex < Z) )
{
int index_out = xIndex + M*yIndex + N*M*zIndex;
double deltaN=0, deltaS=0, deltaW=0, deltaE=0, deltaU=0, deltaD=0;
double cN, cS, cW, cE, cU, cD;
int indexN = (xIndex-1) + M*yIndex + N*M*zIndex;
int indexS = (xIndex+1) + M*yIndex + N*M*zIndex;
int indexW = xIndex + M*(yIndex-1) + N*M*zIndex;
int indexE = xIndex + M*(yIndex+1) + N*M*zIndex;
int indexU = xIndex + M*yIndex + N*M*(zIndex-1);
int indexD = xIndex + M*yIndex + N*M*(zIndex+1);
if (xIndex>1)
deltaN = A[indexN]-A[index_out];
if (xIndex<N)
deltaS = A[indexS]-A[index_out];
if (yIndex>1)
deltaW = A[indexW]-A[index_out];
if (yIndex<M)
deltaE = A[indexE]-A[index_out];
if (zIndex>1)
deltaU = A[indexU]-A[index_out];
if (zIndex<Z)
deltaD = A[indexD]-A[index_out];
A[index_out] = deltaN ; // works for deltaN but not for deltaS, deltaW... .
}
非常感谢您的帮助!