我是 CUDA 新手,第一次编写 CUDA 内核。我写了下面这个实现卷积的内核(实现非常朴素),其中带有一个虚拟循环:内核内部循环 100 次,主机端又连续启动内核 10 次,因此对全局内存中的同一个元素总共累加 1000 次(见下文)。问题是,运算结束后,结果矩阵中的一些单元格是错误的:从某个偏移量开始,这些值不再是预期的 1000 的倍数。我的内核:
// Naive 2D sliding-window multiply-accumulate kernel.
//
// Launch layout, as implied by the index arithmetic below:
//   - one block per output element: the output offset depends only on
//     blockIdx (row = blockIdx.x, column = blockIdx.y, row width = gridDim.y);
//   - one thread per filter tap: the filter offset depends only on
//     threadIdx (row = threadIdx.x, column = threadIdx.y, row width = blockDim.y);
//   - the input is read as a row-major matrix whose row width is
//     blockDim.y + gridDim.y - 1.
//
// Every thread of a block adds its product into the same output element, so
// the accumulation uses atomicAdd. The addition is deliberately repeated
// 100 times per launch (dummy loop).
//
// NOTE: the accumulator is a float. A float holds consecutive integers
// exactly only up to 2^24 (16,777,216); once an output element grows beyond
// that, further additions are rounded.
__global__ void conv(float *input, float *kernel, float *target)
{
    // Flat offset of the output element owned by this block.
    const unsigned int outOffset = gridDim.y * blockIdx.x + blockIdx.y;

    // Flat offset of the input element under this thread's filter tap.
    const unsigned int inputRowWidth = blockDim.y + gridDim.y - 1;
    const unsigned int inOffset =
        (blockIdx.x + threadIdx.x) * inputRowWidth + (blockIdx.y + threadIdx.y);

    // Flat offset of this thread's filter tap.
    const unsigned int tapOffset = threadIdx.x * blockDim.y + threadIdx.y;

    long remaining = 100;
    while (remaining > 0)
    {
        // Operands are re-read from global memory on every pass, as before.
        atomicAdd(target + outOffset, input[inOffset] * kernel[tapOffset]);
        --remaining;
    }
}
内核的调用代码如下:
// Host-side driver: builds a 32x32 ramp image and a 5x5 all-ones filter,
// uploads both to the GPU, launches conv 10 times over a 28x28 grid of 5x5
// blocks, copies the 28x28 result back and prints its first element.
// Every CUDA runtime call and every kernel launch is checked, so a failure
// is reported instead of silently showing up as wrong values in res.
float image[1024] = {0.0f};
float kernel[] =
{
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f
};
float res[784]={0};
for (int i = 0; i < 1024; i++)
{
image[i]=(float)i;
} // Got 32x32 matrix
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
exit (-1);
}
float *dev_image = 0;
float *dev_kernel = 0;
float *dev_res = 0;
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
exit(-10);
}
// Upload the inputs and zero the accumulator buffer; each call is checked.
cudaStatus = cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy (image, host to device) failed: %s\n", cudaGetErrorString(cudaStatus));
exit(-20);
}
cudaStatus = cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy (kernel, host to device) failed: %s\n", cudaGetErrorString(cudaStatus));
exit(-20);
}
// cudaMemset fills bytes; an all-zero byte pattern is 0.0f for float.
cudaStatus = cudaMemset(dev_res,0,sizeof(res));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemset failed: %s\n", cudaGetErrorString(cudaStatus));
exit(-20);
}
// Convolving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
dim3 blocks(28,28,1);
dim3 threads(5,5,1);
// NOTE: res is accumulated in float. All addends here are integer-valued, so
// a sum stays exact only while it does not exceed 2^24 = 16,777,216. After
// 10 launches x 100 in-kernel iterations the bottom-right 5x5 window reaches
// 23925 * 1000 = 23,925,000, so outputs whose total passes 2^24 are rounded
// and are no longer exact multiples of 1000. That is floating-point rounding,
// not a data race.
for (int itr = 0; itr<10; itr++)
{
conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
// A kernel launch returns no status; launch errors are read back here.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "conv launch failed: %s\n", cudaGetErrorString(cudaStatus));
exit(-30);
}
}
// Blocking copy: waits for the queued kernels and also reports any error
// raised while they were executing.
cudaStatus = cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy (result, device to host) failed: %s\n", cudaGetErrorString(cudaStatus));
exit(-20);
}
printf("res[0]=%f\n",res[0]);
cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);
exit (0);
我已经使用 atomicAdd 处理了并发写入的问题,所以并发应该不是根本原因。如能得到任何帮助,我将不胜感激。