I'm trying to track down a bug in my program, which multiplies square matrices using CUDA. Since I can't call printf() from the kernel (please tell me if you know how to do that), I pass an extra struct into the kernel to collect the thread, block, row, and column numbers. All elements of the struct are initialized to -1 (so I can recognize them later). The matrix product comes out partly wrong, but it is the data coming back from the kernel that confuses me completely. Here is the code:
typedef struct {
    int tx;
    int ty;
    int bx;
    int by;
    int rw;
    int cl;
} THREADS;
__global__ void mult1_kernel (float *a, float *b, float *c, THREADS *d, int n) {
    int k;
    float sum = 0.0f;
    int rw = blockIdx.y * blockDim.y + threadIdx.y;
    int cl = blockIdx.x * blockDim.x + threadIdx.x;
    for (k = 0; k < n; k++)
        sum += a[rw*n + k] * b[k*n + cl];
    c[rw*n + cl] = sum;
    d[rw*n + cl].tx = threadIdx.y;
    d[rw*n + cl].ty = threadIdx.x;
    d[rw*n + cl].bx = blockIdx.y;
    d[rw*n + cl].by = blockIdx.x;
    d[rw*n + cl].rw = rw;
    d[rw*n + cl].cl = cl;
}
...
dim3 blockSize = dim3 (16, 16);
dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);
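To double-check that I read the launch configuration correctly, here is a small host-only sanity check of the geometry (my own side calculation, not part of the program):

#include <stdio.h>

int main (void) {
    int n = 4;                    /* matrix size from the run below */
    int bx = 16, by = 16;         /* blockSize */
    int gx = (n + 15) / 16;       /* gridSize.x = ceil(n/16) */
    int gy = (n + 15) / 16;       /* gridSize.y = ceil(n/16) */
    printf ("grid %dx%d of %dx%d blocks -> %d threads for %d matrix elements\n",
            gx, gy, bx, by, gx * gy * bx * by, n * n);
    /* prints: grid 1x1 of 16x16 blocks -> 256 threads for 16 matrix elements */
    return 0;
}

So for n = 4 a single 16x16 block is launched, which means far more threads run than there are matrix elements.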
Here is what it produces:
Input matrix A:
1 2 3 4
2 3 4 5
3 4 5 6
4 5 6 7
Input matrix B:
1 0.5 0.333 0.25
0.5 0.333 0.25 0.2
0.333 0.25 0.2 0.167
0.25 0.2 0.167 0.143
Time spent executing on the GPU: 0.05 milliseconds
Matrix product of A and B (CUDA):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
4 3.05 2.48 2.1
3.22 2.6 2.19 1.89
[ThreadIdx.x][ThreadIdx.y]:
[0][0] [0][1] [0][2] [0][3]
[1][0] [1][1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[BlockIdx.x][BlockIdx.y]:
[0][0] [0][0] [0][0] [0][0]
[0][0] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[rw][cl]:
[0][0] [0][1] [0][2] [0][3]
[1][0] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
[-1][-1] [-1][-1] [-1][-1] [-1][-1]
Matrix product of A and B (Correct result):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
8.17 5.28 4 3.24
10.2 6.57 4.95 4
So every thread is supposed to write out its threadIdx, blockIdx, row number, and column number, yet most of the values are untouched since initialization. Please help me understand what is going on here.
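On the printf question from the beginning: as far as I know, on GPUs of compute capability 2.0 or newer the ordinary printf() can be called directly from device code (in Part 2 below I use cuPrintf instead, which also works on older hardware). A minimal sketch of the printf() variant, assuming a cc 2.0+ card and compilation with -arch=sm_20; the kernel name is just for illustration:

#include <stdio.h>

__global__ void where_am_i (void) {
    /* device-side printf, available on compute capability >= 2.0 */
    printf ("thread (%d,%d) in block (%d,%d)\n",
            threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y);
}

int main (void) {
    where_am_i <<< dim3 (1, 1), dim3 (2, 2) >>> ();
    cudaDeviceSynchronize ();   /* flushes the device-side printf buffer */
    return 0;
}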
Part 2. Every thread prints the correct sum, but after the data is copied from device memory back to the host, some elements become zero. Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include "cuPrintf.cuh"
#include "cuPrintf.cu"
__global__ void mult1_kernel (float *a, float *b, float *c, int n) {
    int k;
    float sum = 0.0f;
    int rw = blockIdx.y * blockDim.y + threadIdx.y;
    int cl = blockIdx.x * blockDim.x + threadIdx.x;
    if ((rw < n) && (cl < n)) {
        rw *= n;
        for (k = 0; k < n; k++)
            sum += a[rw + k] * b[k*n + cl];
        cuPrintf ("Thread[%d][%d] from block[%d][%d]:\n rw = %d, cl = %d, sum = %f\n\n",
                  threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, rw/n, cl, sum);
    }
    c[rw + cl] = sum;
}
__host__ int mult1_host (float *a, float *b, float *c, int n) {
    cudaEvent_t start, stop;
    cudaError_t cuerr;
    int size = n * n * sizeof (float);
    float gpuTime = 0.0f;
    float *aDev = NULL, *bDev = NULL, *cDev = NULL;
    cuerr = cudaMalloc ((void**)&aDev, size);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot allocate GPU memory for aDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cuerr = cudaMalloc ((void**)&bDev, size);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot allocate GPU memory for bDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cuerr = cudaMalloc ((void**)&cDev, size);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot allocate GPU memory for cDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    dim3 blockSize = dim3 (16, 16);
    dim3 gridSize = dim3 ((n+15)/16, (n+15)/16);
    cuerr = cudaMemcpy (aDev, a, size, cudaMemcpyHostToDevice);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot copy data from a to aDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cuerr = cudaMemcpy (bDev, b, size, cudaMemcpyHostToDevice);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot copy data from b to bDev: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cudaPrintfInit ();
    cudaEventCreate (&start);
    cudaEventCreate (&stop);
    cudaEventRecord (start, 0);
    mult1_kernel <<< gridSize, blockSize >>> (aDev, bDev, cDev, n);
    cudaEventRecord (stop, 0);
    cudaEventSynchronize (stop);
    cudaEventElapsedTime (&gpuTime, start, stop);
    cuerr = cudaGetLastError ();
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot launch CUDA kernel: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cuerr = cudaDeviceSynchronize ();
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot synchronize CUDA kernel: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    cudaPrintfDisplay (stdout, true);
    cudaPrintfEnd ();
    cuerr = cudaMemcpy (c, cDev, size, cudaMemcpyDeviceToHost);
    if (cuerr != cudaSuccess) {
        fprintf (stderr, "Cannot copy data from cDev to c: %s\n", cudaGetErrorString (cuerr));
        return (-1);
    }
    printf ("Time spent executing on the GPU: %.2f milliseconds\n", gpuTime);
    printf ("\n");
    cudaEventDestroy (start);
    cudaEventDestroy (stop);
    cudaFree (aDev);
    cudaFree (bDev);
    cudaFree (cDev);
    return (0);
}
int main () {
    int i, j, k;
    int n;
    float *a, *b, *c;
    printf ("Enter the matrix size: ");
    scanf ("%d", &n);
    printf ("Blocks = %d\n", n*n/512 + 1);
    if (n < 1) {
        printf ("Invalid matrix size\n");
        return (-1);
    }
    if (!(a = (float*) malloc (n * n * sizeof (float)))) {
        fprintf (stderr, "Cannot allocate CPU memory for a\n");
        return (-1);
    }
    if (!(b = (float*) malloc (n * n * sizeof (float)))) {
        fprintf (stderr, "Cannot allocate CPU memory for b\n");
        return (-1);
    }
    if (!(c = (float*) malloc (n * n * sizeof (float)))) {
        fprintf (stderr, "Cannot allocate CPU memory for c\n");
        return (-1);
    }
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            a[i*n + j] = (float)(i + j + 1);
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            b[i*n + j] = 1 / (float)(i + j + 1);
    printf ("Input matrix A:\n");
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++)
            printf ("%8.3g ", a[i*n + j]);
        printf ("\n");
    }
    printf ("\n");
    printf ("Input matrix B:\n");
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++)
            printf ("%8.3g ", b[i*n + j]);
        printf ("\n");
    }
    printf ("\n");
    if (mult1_host (a, b, c, n) < 0) {
        return (-1);
    }
    printf ("Matrix product of A and B (CUDA):\n");
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++)
            printf ("%8.3g ", c[i*n + j]);
        printf ("\n");
    }
    printf ("\n");
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++) {
            c[i*n + j] = 0;
            for (k = 0; k < n; k++)
                c[i*n + j] += a[i*n + k] * b[k*n + j];
        }
    printf ("Matrix product of A and B (Check):\n");
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++)
            printf ("%8.3g ", c[i*n + j]);
        printf ("\n");
    }
    free (a);
    free (b);
    free (c);
    return (0);
}
It produces:
[0, 0]: Thread[0][0] from block[0][0]:
rw = 0, cl = 0, sum = 4.000000
[0, 1]: Thread[1][0] from block[0][0]:
rw = 0, cl = 1, sum = 2.716667
[0, 2]: Thread[2][0] from block[0][0]:
rw = 0, cl = 2, sum = 2.100000
[0, 3]: Thread[3][0] from block[0][0]:
rw = 0, cl = 3, sum = 1.721429
[0, 16]: Thread[0][1] from block[0][0]:
rw = 1, cl = 0, sum = 6.083333
[0, 17]: Thread[1][1] from block[0][0]:
rw = 1, cl = 1, sum = 4.000000
[0, 18]: Thread[2][1] from block[0][0]:
rw = 1, cl = 2, sum = 3.050000
[0, 19]: Thread[3][1] from block[0][0]:
rw = 1, cl = 3, sum = 2.480952
[0, 32]: Thread[0][2] from block[0][0]:
rw = 2, cl = 0, sum = 8.166666
[0, 33]: Thread[1][2] from block[0][0]:
rw = 2, cl = 1, sum = 5.283333
[0, 34]: Thread[2][2] from block[0][0]:
rw = 2, cl = 2, sum = 4.000000
[0, 35]: Thread[3][2] from block[0][0]:
rw = 2, cl = 3, sum = 3.240476
[0, 48]: Thread[0][3] from block[0][0]:
rw = 3, cl = 0, sum = 10.250000
[0, 49]: Thread[1][3] from block[0][0]:
rw = 3, cl = 1, sum = 6.566667
[0, 50]: Thread[2][3] from block[0][0]:
rw = 3, cl = 2, sum = 4.950000
[0, 51]: Thread[3][3] from block[0][0]:
rw = 3, cl = 3, sum = 4.000000
Matrix product of A and B (CUDA):
4 2.72 2.1 1.72
6.08 0 0 0
0 0 0 0
0 0 0 0
Time spent executing on the GPU: 0.06 milliseconds
Matrix product of A and B (Check):
4 2.72 2.1 1.72
6.08 4 3.05 2.48
8.17 5.28 4 3.24
10.2 6.57 4.95 4
Thanks!