我最近买了一张gtx550ti boost卡。以前在我的旧 gf440 卡上运行的程序失败了。这是一个例子。以下程序适用于较小的内核,但适用于较大的内核。
#include "stdio.h"
__global__ void kernel(float * d_in, float * d_out){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int idx = x + y * blockDim.x * gridDim.x;
d_out[idx] = d_in[idx];
}
int main(){
const dim3 gridSize(10,10);
const dim3 blockSize(80,80);
const int size = 800*800;
float * h_in = new float[size];
float * h_out = new float[size];
float * d_in;
float * d_out;
cudaMalloc((void**)&d_in, sizeof(float)*size);
cudaMalloc((void**)&d_out, sizeof(float)*size);
for(int i = 0; i < size; i++)
h_in[i] = (float)i;
cudaMemcpy(d_in, h_in, sizeof(float)*size, cudaMemcpyHostToDevice);
kernel<<<gridSize,blockSize>>>(d_in, d_out);
cudaMemcpy(h_out, d_out, sizeof(float)*size, cudaMemcpyDeviceToHost);
for(int i = 0; i < size; i++)
printf("%f\n",h_out[i]);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
我希望它以浮点数输出索引。但它输出一些随机浮点数:
0.131061
2.520029
9.304665
0.000189
0.242134
0.525557
0.560013
尺寸 100*100
相反,当我切换到 100*100 尺寸时:
const dim3 gridSize(10,10);
const dim3 blockSize(10,10);
const int size = 100*100;
它工作正常(最后5个输出):
9995.000000
9996.000000
9997.000000
9998.000000
9999.000000
尺寸 500*500
但对于更大的尺寸 500*500:
const dim3 gridSize(10,10);
const dim3 blockSize(50,50);
const int size = 500*500;
它输出错误的索引(最后 5 个输出):
512139.000000
512140.000000
512141.000000
512142.000000
512143.000000
我安装了 CUDA 5.5。谢谢!