cuda - 无法在 CUDA 中执行设备内核

Question

我正在尝试在全局内核中调用设备内核。我的全局内核是矩阵乘法，我的设备内核正在查找乘积矩阵每一列中的最大值和索引。以下是代码：

__device__ void MaxFunction(float* Pd, float* max)
{
  int x = (threadIdx.x + blockIdx.x * blockDim.x);  
  int y = (threadIdx.y + blockIdx.y * blockDim.y); 
  int k = 0;
  int temp = 0; int temp_idx = 0;
  for (k = 0; k < wB; ++k) {
   if(Pd[x*wB + y] > temp){
    temp = Pd[x*wB + y];
    temp_idx = x*wB + y;
   }
       max[y*2 + 0] = temp;
       max[y*2 + 1] = temp_idx;
  }
}

__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, float* max)
{
  // declare cache in the shared memory
  __shared__ float Mds[blockD][blockD];
  __shared__ float Nds[blockD][blockD];

  float Pvalue = 0;
  // Loop over the Md and Nd block dimension required to compute the Pd element
  for (int m = (wA * blockD * blockIdx.y), n = (blockD * blockIdx.x); 
                            m < ((wA * blockD * blockIdx.y)+wA-1); 
                                        m += blockD, n += (blockD*hB)){

    // collaboratively loading of Md and Nd blocks into shared memory    
    Mds[threadIdx.y][threadIdx.x] = Md[m + wA * threadIdx.y + threadIdx.x];
    Nds[threadIdx.y][threadIdx.x] = Nd[n + wA * threadIdx.y + threadIdx.x];
    __syncthreads();

    // keep track of the running sum    
    for (int k = 0; k < blockD; k++)
      Pvalue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];
    __syncthreads();
  }

  // write back to the global memory
  int p = hB * blockD * blockIdx.y + blockD * blockIdx.x;
  Pd[p + hB * threadIdx.y + threadIdx.x] = Pvalue;
  __syncthreads();

  MaxFunction(Pd, max);

}

主要代码：

#include<stdio.h>
#include "cuda.h"
#include<stdlib.h>

#define blockD 32


const int wA = 128;
const int hA = 1024;

const int wB = 128;
const int hB = wA;

main(void){

    void MatrixMultiplication(float *, float *, float *, float *);

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *M, *N, *P, *C;   


    // allocate memory on the CPU
    M = (float*)malloc(size_A);
    N = (float*)malloc(size_B);
    P = (float*)malloc(size_max);
    C = (float*)malloc(size_C);

    // initialize the matrices
    for (int y=0; y < hA; y++) {
        for (int x=0; x < wA; x++){
            M[y*wA + x] = x;
       }
    }

    for (int y=0; y<hB; y++) {
        for (int x=0; x<wB; x++){
            N[y*wB + x] = x;
       }
    }

    MatrixMultiplication(M, N, P, C);

    //Write
    FILE *f1;
    int i, j;
    f1 = fopen("max_val.txt","w");
    for(i=0; i < (wB * 2); i+=2){
    fprintf(f1,"%d\t%d\n",int(P[i]),int(P[i+1]));
    }
    fclose(f1);

    f1 = fopen("Prod_mat.txt","w");
    for(i=0; i < 2; i++){
    for(j=0; j < wB; j++){
        fprintf(f1,"%d\t",int(C[i*wB + j]));
    }
    fprintf(f1,"\n");
    }
    fclose(f1);

    free( M );
    free( N );
    free( P ); 
            free( C );

    cudaDeviceReset();
    return 0;
}


void MatrixMultiplication(float *M, float *N, float *P, float *C) {

    int size_A = wA * hA * sizeof(float);
    int size_B = wB * hB * sizeof(float);
    int size_C = wB * hA * sizeof(float);
    int size_max = 2 * wB * sizeof(float);
    float *Md, *Nd, *Pd, *max; 

    // allocate memory on the GPU
    cudaMalloc((void**)&Md, size_A);
    cudaMalloc((void**)&Nd, size_B);
    cudaMalloc((void**)&Pd, size_C);
    cudaMalloc((void**)&max, size_max);

    // transfer M and N to device memory
    cudaMemcpy(Md, M, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(Nd, N, size_B, cudaMemcpyHostToDevice);

    // kernel invocation code
    dim3 dimBlock(blockD, blockD);
    dim3 dimGrid(wA/blockD, hB/blockD);

    //Execute Kernel
    MatrixMulKernel<<<dimGrid, dimBlock>>>( Md, Nd, Pd, max);

    // transfer P from device    
    cudaMemcpy(P, max, size_max, cudaMemcpyDeviceToHost);
    cudaMemcpy(C, Pd, size_C, cudaMemcpyDeviceToHost);

    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
    cudaFree(max);
}

矩阵乘法结果很好（使用 Matlab 验证），但我无法获得最大值及其相应的索引。如果有人能指出我做错了什么，我将不胜感激。当我运行上面的代码时，max 变量只有垃圾。

score 2 · Accepted Answer

显然，您正试图在每一列中找到最大值，以及该值的偏移量。

但是你所有的线程y都在同一个位置敲击最大值（max[x*2 + 0]）。不建议这样做，因为没有办法解决竞争条件。您应该使用原子操作或其他方法（例如归约）来处理以这种方式更新单个最大值的多个线程。

由于您需要以原子方式更新两个值（最大值和它的位置），因此将普通访问替换为标准原子函数并不是一件简单的事情。但是，由于您要处理两个 32 位相邻量，您可能会对我的回答感兴趣。

顺便说一句，我认为 matlab 的本机矩阵乘法gpuArray应该比您编写的任何矩阵乘法代码都要快。但它需要并行计算工具箱。

cuda - 无法在 CUDA 中执行设备内核

1 回答 1

Related

Reference