在运行以下内核时,我遇到了以下易于重现的问题,除了浮点数的 atomicAdds 之外什么都不做:
#define OUT_ITERATIONS 20000000
#define BLOCKS 12
#define THREADS 192
__global__ void testKernel(float* result) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
float bias = 1.0f;
int n = 1;
while (i < OUT_ITERATIONS) {
atomicAdd(result, bias);
i += BLOCKS * THREADS;
}
}
内核应该将结果增加 OUT_ITERATIONS 次,即 20M。我用这个标准代码调用内核:
int main() {
cudaError_t cudaStatus;
float* result;
float* dev_result;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
result = new float;
cudaStatus = cudaMalloc((void**)&dev_result, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaStatus = cudaMemset(dev_result, 0, sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemset failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
testKernel<<<BLOCKS, THREADS>>>(dev_result);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
cudaStatus = cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
printf("Result: %f\n", *result);
但是,最后打印的结果是 16777216.0,顺便说一下 0x1000000 十六进制。如果 OUT_ITERATIONS < 16777216 则不会出现问题,也就是说,如果我将其更改为 16777000 例如,输出果然是 16777000.0!
系统:NVidia-Titan、CUDA 5.5、Windows7