我正在尝试使用共享内存在 cuda c 中做一个关于点积的简单教程;代码非常简单,它基本上是在两个数组的元素之间进行乘积,然后对每个块的结果求和:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#define imin(a,b) (a<b?a:b)
const int N = 33*1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin(32 , (N+threadsPerBlock-1)/threadsPerBlock);
__global__ void dot(float *a, float *b, float *c){
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int cacheIndex = threadIdx.x;
float temp = 0;
while (tid < N){
temp += a[tid]*b[tid];
tid += blockDim.x*gridDim.x; /* Aggiorno l'indice per l'evenutale overshoot. */
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while(i != 0){ /
if(cacheIndex < i){
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();
i /= 2;
}
}
if(cacheIndex == 0){
c[blockIdx.x] = cache[0];
}
}
int main(void){
cudaError_t err = cudaSuccess;
float a[N], b[N], c[blocksPerGrid];
float *d_a, *d_b, *d_c;
int i;
for(i=0;i<N;i++){
a[i] = i;
b[i] = i*2;
}
for(i=0; i<blocksPerGrid;i++){
c[i] = 0;
}
err = cudaMalloc((void**)&d_a, N*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector a (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_b, N*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector b (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_c, blocksPerGrid*sizeof(float));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector c (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
/* Copio i valori dei vettori a e b nello spazio di memoria allocato precedentemente nel device. */
err = cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector a from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector b from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_c, c, blocksPerGrid*sizeof(float), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector c from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
dot<<<blocksPerGrid,threadsPerBlock>>>(d_a, d_b, d_c); err = cudaGetLastError();
err = cudaMemcpy(c, d_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector c from device to host (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_a);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector a (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_b);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector b (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_c);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector c (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
float result = 0;
for(i=0;i<blocksPerGrid;i++){
result += c[i];
}
printf("il risultato finale è: %.2f\n", result);
return 0;
}
此代码与 Cuda by Example 书中提供的代码相同,唯一的区别在于向量 a、b 和 c 的定义(我定义它们的方式应该不是问题,因为我已经做过几次了)。
这就是问题所在:当我尝试运行程序时它崩溃了!终端说问题是:Failed to copy vector c from device to host (error code the launch timed out and was terminated)!
这很奇怪,因为我认为我已经以正确的方式分配了向量 c ......有没有人知道我做错了什么?是全局函数还是主函数有问题?