我正在使用 CUDA 添加两个矩阵,并将它们的结果提供给另一个矩阵。我希望利用共享内存功能,为此,我写了以下内容:
#include <stdio.h>
#include <cuda.h>
#define grid 1024
#define BSZ 16
__global__ void addition(int *dev_a, int *dev_b, int *dev_c)
{
__shared__ int as[BSZ][BSZ];
__shared__ int bs[BSZ][BSZ];
int by = blockIdx.y;
int bx = blockIdx.x;
int cvalue;
int ty = threadIdx.y;
int tx = threadIdx.x;
int row = by * BSZ + ty;
int col = bx * BSZ + tx;
as[ty][tx] = dev_a[row*grid + col];
bs[ty][tx] = dev_b[row*grid + col];
__syncthreads();
cvalue = as[ty][tx] + bs[ty][tx];
__syncthreads();
dev_c[row*grid + col] = cvalue;
}
int main ()
{
int a[grid][grid], b[grid][grid], c[grid][grid];
//c = a + b
for(int i=0;i<grid;i++)
{
for(int j=0;j<grid;j++)
{
a[i][j]=2;
b[i][j]=1;
}
}
printf("Working fine here");
int *dev_a;
int *dev_b;
int *dev_c;
int size = grid * grid * sizeof(int);
printf("Working fine");
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, size );
cudaMemcpy(dev_a,a,size,cudaMemcpyHostToDevice);
cudaMemcpy(dev_b,b,size,cudaMemcpyHostToDevice);
dim3 dimBlock(BSZ,BSZ);
dim3 dimGrid(grid/dimBlock.x,grid/dimBlock.y);
//Kernel launch
addition<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c,dev_c,size,cudaMemcpyDeviceToHost);
for (int i=0; i<grid; i++)
{
for(int j=0;j<grid;j++)
{
printf( "%d + %d = %d\n", a[i][j], b[i][j], c[i][j] );
}
}
}
我收到分段错误错误,我无法理解为什么!请有人帮我解决这个问题。