我正在用 CUDA 测试一些代码(我是 CUDA 新手,这是我的第一个应用程序)。到目前为止,CUDA 版本得到的结果与在 CPU 上串行运行同一段代码得到的结果完全一致。我使用的是 Visual Studio 2010,构建配置为 Debug(调试)。但是,一旦把构建配置改为 Release(发布),得到的结果就不正确了。NVIDIA 论坛目前处于关闭状态,所以我无法在那里提问。有 CUDA 经验的朋友能否帮忙指出问题所在?代码如下:
// MyKernel: per-cell force accumulation on an N x N grid.
//
// Launch layout: 2D grid of 2D blocks. nx is taken from the x dimension and ny
// from the y dimension; threads with nx >= N or ny >= N exit immediately, so
// the grid may overshoot N in either direction.
//
// Parameters (all pointers are dereferenced inside the kernel and therefore
// must be readable from device code):
//   Nptr          - points to the grid edge length N
//   deltaptr      - points to the neighbourhood half-width (in cells)
//   gravityptr    - points to the value whose negation seeds the y force
//   separationptr - points to the factor applied to the grid-index distance
//   fconptr       - points to the factor applied to every neighbour term
//   xForce/yForce/zForce - outputs, indexed [nx*N + ny]; each thread writes
//                          exactly one element of each array
//   xPos/yPos/zPos       - inputs, indexed [nx*N + ny]; only read here
//
// For every neighbour (dx, dy) != (nx, ny) inside the (2*delta+1)^2 window
// clipped to [0, N), the thread adds
//     fcon * norm{x,y,z}OnDevice(r12) * (magOnDevice(r12) - len)
// where r12 = pos[dx,dy] - pos[nx,ny] and
//     len = sqrt((nx-dx)^2 + (ny-dy)^2) * separation.
//
// NOTE(review): magOnDevice / norm?OnDevice are defined outside this block;
// this code assumes they are side-effect-free functions of their arguments
// (magOnDevice is evaluated once per neighbour instead of three times).
// NOTE(review): if norm?OnDevice divides by the magnitude, two coincident
// positions would yield NaN/Inf - confirm against the helper definitions.
// NOTE(review): consecutive threadIdx.x values address elements N apart
// (index = nx*N + ny), so global accesses are strided across a warp; mapping
// threadIdx.x to the fastest-varying index would likely coalesce better.
__global__ void MyKernel(int *Nptr,int *deltaptr, double *gravityptr, double *separationptr, double *fconptr, double *xForce, double *yForce, double *zForce,
double *xPos, double *yPos, double *zPos )
{
    const int    N          = *Nptr;
    const int    delta      = *deltaptr;
    const double gravity    = *gravityptr;
    const double separation = *separationptr;
    const double fcon       = *fconptr;

    const int nx = blockDim.x * blockIdx.x + threadIdx.x;
    const int ny = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads outside the N x N domain have nothing to do.
    if (nx >= N || ny >= N)
        return;

    // Flat index computed in size_t so that nx*N does not overflow int for large N.
    const size_t self = (size_t)nx * (size_t)N + (size_t)ny;

    // This cell's position is loop-invariant: read it once.
    const double selfX = xPos[self];
    const double selfY = yPos[self];
    const double selfZ = zPos[self];

    // Accumulate in registers and store once at the end, instead of doing a
    // global-memory read-modify-write for every neighbour.
    double fx = 0.0;
    double fy = -gravity;
    double fz = 0.0;

    // Neighbourhood window clipped to the grid; both ranges are independent
    // of the loop variables, so they are computed once.
    const int dxBegin = maxOnDevice(nx - delta, 0);
    const int dxEnd   = minOnDevice(nx + delta + 1, N);
    const int dyBegin = maxOnDevice(ny - delta, 0);
    const int dyEnd   = minOnDevice(ny + delta + 1, N);

    for (int dx = dxBegin; dx < dxEnd; dx++)
    {
        for (int dy = dyBegin; dy < dyEnd; dy++)
        {
            // A cell does not interact with itself.
            if (dx == nx && dy == ny)
                continue;

            const size_t other = (size_t)dx * (size_t)N + (size_t)dy;

            const double r12X = xPos[other] - selfX;
            const double r12Y = yPos[other] - selfY;
            const double r12Z = zPos[other] - selfZ;

            // Grid-index distance between the two cells, scaled by separation.
            const double len = sqrt((double)((nx - dx) * (nx - dx) + (ny - dy) * (ny - dy))) * separation;

            // Same for all three components, so evaluate it once.
            const double stretch = magOnDevice(r12X, r12Y, r12Z) - len;

            // Operand order (fcon * norm * stretch) and summation order are
            // kept as in the original expression so rounding is unchanged.
            fx = fx + fcon * normxOnDevice(r12X, r12Y, r12Z) * stretch;
            fy = fy + fcon * normyOnDevice(r12X, r12Y, r12Z) * stretch;
            fz = fz + fcon * normzOnDevice(r12X, r12Y, r12Z) * stretch;
        }
    }

    xForce[self] = fx;
    yForce[self] = fy;
    zForce[self] = fz;
}
谢谢