cuda - 矩阵乘法给出错误的输出

Question

我正在尝试做的是乘以矩阵 A 和矩阵 B，然后从乘积矩阵中得到每列最大值的索引。但不幸的是，只有矩阵乘法的前 128*128 个值是正确的，而其他的只是垃圾。我不太明白这是如何工作的。我请求你用这个来指导我..

#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>

#define blockD 32
const int wA = 128;
const int hA = 4096;    
const int wB = 4096;
const int hB = wA;

main(void){

    void MatrixMultiplication(float *, float *, float *, float *);

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *M, *N, *P, *C;   

    // allocate memory on the CPU
    M = (float*)malloc(size_A);
    N = (float*)malloc(size_B);
    P = (float*)malloc(size_max);
    C = (float*)malloc(size_C);

    // initialize the matrices
    for (int y=0; y < hA; y++) {
        for (int x=0; x < wA; x++){
            M[y*wA + x] = 32; //x + y*wA; 
       }
    }

    for (int y=0; y<hB; y++) {
        for (int x=0; x<wB; x++){
            N[y*wB + x] = 21; //x + y*wB; 
       }
    }


    MatrixMultiplication(M, N, P, C);

    //Write
    FILE *f1;
    int i,j;
    f1 = fopen("C.txt","w");
    for(i = hA - 2 ; i < hA; i ++){
    for(j = 0; j < wB; j++){
        fprintf(f1,"%d\t",int(C[i*wB + j]));
    }
    fprintf(f1,"\n");
    }
    fclose(f1);

    // free the memory allocated on the CPU
    free( M );
    free( N );
    free( P ); 
    free( C );
    cudaDeviceReset();
    return 0;
}


__device__ void MaxFunction(float* Pd, float* max)
{
 int x = (threadIdx.x + blockIdx.x * blockDim.x);  
 int y = (threadIdx.y + blockIdx.y * blockDim.y); 

 int k = 0;

 int temp = 0; int temp_idx = 0;
 for (k = 0; k < wB; ++k) {
            if(Pd[x*wB + k] > temp){
                temp = Pd[x*wB + k];
                temp_idx = x*wB + k;
            }
  }
  max[y*2 + 0] = temp;
  max[y*2 + 1] = temp_idx;
}


__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
  // declare cache in the shared memory
  __shared__ float Mds[blockD][blockD];
  __shared__ float Nds[blockD][blockD];

  float Pvalue = 0;
  // Loop over the Md and Nd block dimension required to compute the Pd element
  for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x); 
                            m < ((wA * blockD * blockIdx.y)+wA-1); 
                                        m += blockD, n += (blockD*hB)){

    // collaboratively loading of Md and Nd blocks into shared memory    
    Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
    Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
    __syncthreads();

    // keep track of the running sum    
    for (int k = 0; k < blockD; k++)
      Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
    __syncthreads();
  }

  // write back to the global memory
  int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
  Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
  __syncthreads();

  MaxFunction(Pd, max);

}

void MatrixMultiplication(float *M, float *N, float *P, float *C) {

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *Md, *Nd, *Pd, *max; 

    // allocate memory on the GPU
    cudaMalloc((void**)&Md, size_A);
    cudaMalloc((void**)&Nd, size_B);
    cudaMalloc((void**)&Pd, size_C);
    cudaMalloc((void**)&max, size_max);

    // transfer M and N to device memory
    cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);

    // kernel invocation code
    dim3 dimBlock(blockD, blockD);
    dim3 dimGrid(wA/blockD, hB/blockD);

    //Execute Kernel
    MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);

    // transfer P from device    
    cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
    cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);

    // free the memory allocated on the GPU
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
    cudaFree(max);
}

score 1 · Accepted Answer

在您的代码中，您似乎有不止一个问题。问题之一是，取而代之的是：

dim3 dimGrid(wA/blockD, hB/blockD);

你应该有这个：

dim3 dimGrid(wB/blockD, hA/blockD);

最终，对于每个输出点，您的网格中都需要一个线程。你的公式是给你一个 4 x 4 块的网格，而你需要一个 128 x 128 块的网格。

我在您的代码中发现的另一个问题是在内核中的这些行中：

int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;

它们没有通过输出数组正确索引。我没有尝试使用您的方案对其进行排序，而是使用了这个：

Pd[(threadIdx.x + (blockIdx.x * blockDim.x)) + ((threadIdx.y + (blockIdx.y * blockDim.y))*(gridDim.x*blockDim.x))] = Pvalue;

当我对您的代码进行上述两个更改时，我在整个数组中得到了我认为正确的结果。在我的机器上运行它大约需要 32 秒。（请注意，我没有尝试修复您原来的最大查找代码——请参阅下文以获得更好的方法。）

根据您之前的问题，您似乎担心速度。如果你想做快速矩阵乘法，你应该使用cublas。以下代码显示了如何使用 cublas 将两个普通的 C 样式矩阵相乘（它们不必是正方形）。我还包括了一个列最大查找内核，当列数很大时（例如，超过 500 列左右。在您的示例中，您有 4096 列）。对于少量的列，可能有更快的方法来执行此功能，但少量的列也表明整体问题规模可能很小，因此（这段代码的）速度不会成为真正的问题。

这是代码：

#include <stdio.h>
#include <cublas_v2.h>
#define VERBOSE 1
#define nTPB 64
#define ROW_A 4
#define COL_A 4
#define ROW_B COL_A
#define COL_B 4
#define ROW_C ROW_A
#define COL_C COL_B
#define SIZ_A (ROW_A*COL_A)
#define SIZ_B (ROW_B*COL_B)
#define SIZ_C (ROW_C*COL_C)



// error check macros
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
    do { \
        cublasStatus_t __err = fn; \
        if (__err != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                (int)(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void col_max(float *mat, float *max, unsigned int *midx, unsigned int rows, unsigned int cols){
  int idx = threadIdx.x + blockDim.x*blockIdx.x;
  if (idx < cols){
    float tempmax = mat[idx];
    unsigned int tempmidx = 0;
    for (int i = 1; i< rows; i++)
      if (mat[idx + (i*cols)] > tempmax){
        tempmax = mat[idx + (i*cols)];
        tempmidx = i;}
    max[idx] = tempmax;
    midx[idx] = tempmidx;
  }
}

int main(){

  float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C, *h_max, *d_max;
  unsigned int *h_idx, *d_idx;

  h_A = (float *)malloc(SIZ_A*sizeof(float));
  if (h_A==0) {printf("malloc fail\n"); return -1;}
  h_B = (float *)malloc(SIZ_B*sizeof(float));
  if (h_B==0) {printf("malloc fail\n"); return -1;}
  h_C = (float *)malloc(SIZ_C*sizeof(float));
  if (h_C==0) {printf("malloc fail\n"); return -1;}
  h_max = (float *)malloc(COL_C*sizeof(float));
  if (h_max==0) {printf("malloc fail\n"); return -1;}
  h_idx = (unsigned int*)malloc(COL_C*sizeof(unsigned int));

  if (h_idx==0) {printf("malloc fail\n"); return -1;}

  cudaMalloc((void **)&d_A, SIZ_A*sizeof(float));
  cudaMalloc((void **)&d_B, SIZ_B*sizeof(float));
  cudaMalloc((void **)&d_C, SIZ_C*sizeof(float));
  cudaMalloc((void **)&d_max, COL_C*sizeof(float));
  cudaMalloc((void **)&d_idx, COL_C*sizeof(unsigned int));
  cudaCheckErrors("cuda malloc fail");

  // initialize data
  for (int i=0; i< SIZ_A; i++) h_A[i] = (float)(i+1);
  for (int i=0; i< SIZ_B; i++) h_B[i] = (float)(i+2);

  cudaMemcpy(d_A, h_A, SIZ_A*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, SIZ_B*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cuda memcpy 1 fail");
  const float alpha = 1.0f;
  const float beta  = 0.0f;
  cublasHandle_t handle;
  cublasCheckErrors(cublasCreate(&handle));
  // C = A*B
  // due to cublas expecting column-major storage, parameters
  // are scrambled
  cublasCheckErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, COL_B, ROW_A, COL_A, &alpha, d_B, COL_B, d_A, COL_A, &beta, d_C, COL_C));
  cudaMemcpy(h_C, d_C, SIZ_C*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cuda memcpy 2 fail");
  col_max<<<(COL_C + nTPB - 1)/nTPB, nTPB>>>(d_C, d_max, d_idx, ROW_C, COL_C);
  cudaCheckErrors("kernel launch fail");
  cudaMemcpy(h_max, d_max, COL_C*sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_idx, d_idx, COL_C*sizeof(unsigned int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cuda memcpy 3 fail/kernel fail");

  if (VERBOSE){
    printf("A: \n");
    for (int i=0; i< ROW_A; i++){
      for (int j=0; j< COL_A; j++)
        printf("%7.5G", h_A[j+(i*COL_A)]);
      printf("\n");}
    printf("B: \n");
    for (int i=0; i< ROW_B; i++){
      for (int j=0; j< COL_B; j++)
        printf("%7.5G", h_B[j+(i*COL_B)]);
      printf("\n");}
    printf("C = A*B: \n");
    for (int i=0; i< ROW_C; i++){
      for (int j=0; j< COL_C; j++)
        printf("%7.5G", h_C[j+(i*COL_C)]);
      printf("\n");}
    printf("COLUMN MAX:\n");
    for (int i=0; i< COL_C; i++)
      printf("%7.5G", h_max[i]);
    printf("\nCOLUMN MAX IDX:\n");
    for (int i=0; i< COL_C; i++)
      printf("%7d", h_idx[i]);
  }
  printf("\n finished!\n");
  return 0;
}

这是我用来编译的：

$ nvcc -arch=sm_20 -O3 -o t221 t221.cu -lcublas

这是示例输出：

$ cuda-memcheck ./t221
========= CUDA-MEMCHECK
A:
      1      2      3      4
      5      6      7      8
      9     10     11     12
     13     14     15     16
B:
      2      3      4      5
      6      7      8      9
     10     11     12     13
     14     15     16     17
C = A*B:
    100    110    120    130
    228    254    280    306
    356    398    440    482
    484    542    600    658
COLUMN MAX:
    484    542    600    658
COLUMN MAX IDX:
      3      3      3      3
 finished!
========= ERROR SUMMARY: 0 errors
$

当我扩展我的代码以处理您指定的相同尺寸时，（A = 4096x128，B=128x4096）在我的机器上花了大约 1 秒。所以它比你的代码快得多。但是，当我使用您的代码并在内核中注释掉您的调用时MaxFunction，计算矩阵乘法结果也只需要大约 1 秒。因此，如果您想保留矩阵乘法代码（即不使用 cublas），您可以将代码分解为 2 个内核，并在第一个内核中使用您的乘法例程，在第二个内核中使用我的最大查找例程（col_max），也可能得到一个相当快的结果。

正如@talonmies 所指出的，如果您在 Windows 机器上运行，请确保您了解 Windows TDR 的后果。（如果需要，在右上角搜索框中搜索）

cuda - 矩阵乘法给出错误的输出

1 回答 1

Related

Reference