1

我正在使用 CUDA 添加两个矩阵,并将它们的结果提供给另一个矩阵。我希望利用共享内存功能,为此,我写了以下内容:

#include <stdio.h>
#include <cuda.h>
#define grid 1024
#define BSZ 16

    __global__ void addition(int *dev_a, int *dev_b, int *dev_c)
    {

    __shared__ int as[BSZ][BSZ];
    __shared__ int bs[BSZ][BSZ];

    int by = blockIdx.y;
    int bx = blockIdx.x;

    int cvalue;

    int ty = threadIdx.y;
    int tx = threadIdx.x;

    int row = by * BSZ + ty;
    int col = bx * BSZ + tx;

    as[ty][tx] = dev_a[row*grid + col];
    bs[ty][tx] = dev_b[row*grid + col];
    __syncthreads();

    cvalue = as[ty][tx] + bs[ty][tx];

    __syncthreads();

    dev_c[row*grid + col] = cvalue;
    }

int main ()
{
    int a[grid][grid], b[grid][grid], c[grid][grid];
    //c = a + b
    for(int i=0;i<grid;i++)
    {
      for(int j=0;j<grid;j++)
      {
        a[i][j]=2;
        b[i][j]=1;
      }
    }

    printf("Working fine here");
    int *dev_a;
    int *dev_b;
    int *dev_c;
    int size = grid * grid * sizeof(int);

    printf("Working fine");
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );
    cudaMalloc( (void**)&dev_c, size );

    cudaMemcpy(dev_a,a,size,cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b,b,size,cudaMemcpyHostToDevice);

    dim3 dimBlock(BSZ,BSZ);
    dim3 dimGrid(grid/dimBlock.x,grid/dimBlock.y);

    //Kernel launch
    addition<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);

    cudaMemcpy(c,dev_c,size,cudaMemcpyDeviceToHost);

    for (int i=0; i<grid; i++)
    {
      for(int j=0;j<grid;j++)
        {
          printf( "%d + %d = %d\n", a[i][j], b[i][j], c[i][j] );
        }
    }
}

我收到分段错误错误,我无法理解为什么!请有人帮我解决这个问题。

4

1 回答 1

8
int a[1024][1024], b[1024][1024], c[1024][1024];

这些物体的大小是天文数字!你可能溢出了堆栈。我认为如果您减小段错误的大小或增加堆栈的大小,您会发现段错误消失了,但是您的实现允许您这样做,或者甚至可能为它们分配动态存储持续时间(例如,malloc或在您的情况下cudaMalloc)而不是自动存储时间。

于 2013-05-21T04:21:25.910 回答