4

我是 CUDA 新手,第一次玩 CUDA 内核。我有以下实现卷积的内核(非常天真),带有一个虚拟循环,在全局内存中执行相同元素的计算 1000 次(见下文)。问题是,在操作之后,结果矩阵中的一些单元格是错误的:从某个偏移量开始,这些值并不是预期的 1000 的倍数。我的内核:

__global__ void conv(float *input, float *kernel, float *target)
{
    for (long i = 0; i <100; i++)
    {
        atomicAdd(target+gridDim.y*blockIdx.x+blockIdx.y,input[(blockIdx.x+threadIdx.x)*(blockDim.y+gridDim.y-1)+(blockIdx.y+threadIdx.y)]*kernel[threadIdx.x*blockDim.y+threadIdx.y]);
    }
}

内核的调用代码如下:

float image[1024] = {0.0};
float kernel[] = 
{ 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f 
};

float res[784]={0};

for (int i = 0; i < 1024; i++)
{
    image[i]=(float)i;
} // Got 32x32 matrix

cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    exit (-1);
}

float *dev_image = 0;
float *dev_kernel = 0;
float *dev_res = 0;

// Allocate GPU buffers for three vectors (two input, one output)    .
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);

cudaMemset(dev_res,0,sizeof(res));

    // Convloving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
dim3 blocks(28,28,1);
dim3 threads(5,5,1);

for (int itr = 0; itr<10; itr++)
{
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
}

cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);

printf("res[0]=%f\n",res[0]);

cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);

exit (0);

看来我处理了并发问题,所以它不应该是根本原因。我很感激任何帮助。

4

1 回答 1

4

您正在对float值进行任意算术并期望完美的准确性。

floatvalues 可以完美地存储整数,直到某个尾数。一旦我们超过该值,浮点运算就开始变得不精确。自然地,结果中倾向于累积到最大数字(靠近res数组末尾的那些)的值将首先显示这种效果。

让我们将内核中的循环计数与围绕内核的主机代码中的循环计数的乘积称为total_loops. 对于total_loops高达 700 左右的值,我得到“精确”的结果,也就是说,所有结果都可以被total_loops. 之后,随着你逐渐增加,错误开始蔓延,从数组total_loops的末尾开始。res

您可以切换到double而不是,float您的结果会有所不同,除了用于 double 的 atomicAdd 版本不方便使用。但是,编程指南显示了如何创建任意原子操作,并且他们给出的示例恰好是为 double 实现 atomicAdd

因此,对代码的以下修改允许您探索这两个想法:

  • 如果您想查看双重解决问题的方法,请将定义更改为USE_DOUBLE
  • 相反,如果您想了解如何减少total_loops修复问题,请将 LOOPS1 定义从 100 更改为 70。
  • 我还要提一下,对所有API 调用和内核调用进行cuda 错误检查是一种很好的做法(您只涉及一些,而不是内核),但在这种情况下这不是问题。

这是代码:

#include <stdio.h>
#define LOOPS1 100
#define LOOPS2 10
// set to USE_DOUBLE or USE_FLOAT
#define USE_FLOAT

#ifndef USE_DOUBLE
typedef float mytype;
#else
typedef double mytype;
#endif

__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull =
                              (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val +
                               __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);
}

__global__ void conv(mytype *input, mytype *kernel, mytype *target)
{
    for (long i = 0; i <LOOPS1; i++)
    {
        atomicAdd(target+gridDim.y*blockIdx.x+blockIdx.y,input[(blockIdx.x+threadIdx.x)*(blockDim.y+gridDim.y-1)+(blockIdx.y+threadIdx.y)]*kernel[threadIdx.x*blockDim.y+threadIdx.y]);
    }
}

int main(){

mytype image[1024] = {0.0};
mytype kernel[] =
{
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f
};

mytype res[784]={0};

for (int i = 0; i < 1024; i++)
{
    image[i]=(mytype)i;
} // Got 32x32 matrix

cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    exit (-1);
}

mytype *dev_image = 0;
mytype *dev_kernel = 0;
mytype *dev_res = 0;

// Allocate GPU buffers for three vectors (two input, one output)    .
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}

cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);

cudaMemset(dev_res,0,sizeof(res));

    // Convloving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
dim3 blocks(28,28,1);
dim3 threads(5,5,1);

for (int itr = 0; itr<LOOPS2; itr++)
{
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
}

cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);

printf("results:\n");
for (int i = 0; i< (28*28); i++)
  if ((((int)res[i])%(LOOPS1*LOOPS2)) != 0) {printf("first error index: %d, value: %f\n", i, res[i]); return 1;}

cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);

  return 0;
}

请注意,即使您使用double,如果您累积到足够大的值,问题最终会再次出现。

另请注意,这并不是真正的 CUDA/GPU 问题。 float在主机代码中有类似的限制。

于 2013-06-02T14:12:18.090 回答