下面的代码计算两个向量 a 和 b 的点积。正确的结果是 8192。当我第一次运行它时,结果是正确的。然后当我第二次运行它时,结果是前一个结果 + 8192 等等:
1st iteration: result = 8192
2nd iteration: result = 8192 + 8192
3rd iteration: result = 8192 + 8192 
and so on.
我通过在屏幕上打印它进行了检查,并且设备变量 dev_c 没有被释放。更重要的是,写入它会导致类似总和的结果,结果是先前的值加上写入它的新值。我想这可能与 atomicAdd() 操作有关,但毕竟 cudaFree(dev_c) 应该删除它。
#define N 8192
#define THREADS_PER_BLOCK 512
#define NUMBER_OF_BLOCKS (N/THREADS_PER_BLOCK)
#include <stdio.h>
__global__ void dot( int *a, int *b, int *c ) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];
    __syncthreads();
    if( 0 == threadIdx.x ) {
        int sum = 0;
        for( int i= 0; i< THREADS_PER_BLOCK; i++ ){
        sum += temp[i];
        }
        atomicAdd(c,sum);
    }
}
    int main( void ) {
        int *a, *b, *c;
        int *dev_a, *dev_b, *dev_c; 
        int size = N * sizeof( int); 
        cudaMalloc( (void**)&dev_a, size );
        cudaMalloc( (void**)&dev_b, size );
        cudaMalloc( (void**)&dev_c, sizeof(int));
        a = (int*)malloc(size);
        b = (int*)malloc(size);
        c = (int*)malloc(sizeof(int));
        for(int i = 0 ; i < N ; i++){
            a[i] = 1;
            b[i] = 1;
        }
        cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice);
        dot<<< N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>( dev_a, dev_b, dev_c);
        cudaMemcpy( c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);
        printf("Dot product = %d\n", *c);
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);    
        free(a); 
        free(b); 
        free(c);
        return 0;
    }