让我首先为这篇文章道歉。我知道有几篇帖子问我在这里提出的相同问题,但我已经尝试了给出的解决方案,但我仍然没有得到正确的 CUDA 矩阵乘法结果。
从我遵循的示例中,我很确定我在内核中的算法是正确的。我不相信我在将二维数组传递给内核时遇到任何问题,并且由于它们是通过引用传递的,我觉得二维解决方案数组应该在数组打印到主机时包含正确的答案,但事实并非如此。
这可能是我的 dim3 dimGrid(B, B) 和 dim3 dimThreads(T, T) 变量的问题吗?我是 CUDA 框架的新手,我仍在努力解决它。任何建议将不胜感激。我的代码如下:
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
__global__ void MatMultiply (int *a, int *b, int *c, int N) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int val = 0;
for (int e = 0; e < N; ++e) {
val += a[row*N + e] * b[e*N + col];
}
c[row*N+col] = val;
}
int main(void) {
int N, B, T;
printf("Input integer for matrix dimension size: ");
scanf("%d", &N);
printf("Input number of threads in a block: ");
scanf("%d", &T);
printf("Input number of blocks in a grid: ");
scanf("%d", &B);
int size = N * N * sizeof(int);
int *a, *b, *c;
a = (int*)malloc(size);
b = (int*)malloc(size);
c = (int*)malloc(size);
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
a[i*N+j] = j + i*N;
b[i*N+j] = j + i*N;
c[i*N+j] = j + i*N;
}
}
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_c, c, size, cudaMemcpyHostToDevice);
dim3 dimGrid(B, B);
dim3 dimThreads(T, T);
MatMultiply<<<B, T>>>(dev_a,dev_b,dev_c, N);
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
printf("%d\t", b[i*N + j]);
}
printf("\n");
}
free(a);
free(b);
free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
再次感谢。