cuda - CUDA 矩阵乘法锁定并显示零矩阵

Question

我正在尝试编写一个简单的矩阵乘法程序，该程序不断地把两个矩阵的乘积累加到第三个结果矩阵中（实际上我是想让 GPU 持续满负荷运行，同时用一台独立的设备测量功耗）。

当我指定大量迭代时，就会出现我的问题。我已经尝试过使用 BLOCK_SIZE 和矩阵维度值的几种组合，并且我注意到迭代次数可以随着更小的矩阵维度而增加，但是 BLOCK_SIZE 必须是矩阵维度（方阵）的平方根。

在这种情况下，出现的错误是：程序冻结约 39 秒（无论迭代次数具体是多少，只要“太多”就会如此），随后输出一个全零矩阵。有趣的是，我用 20000 次迭代运行过一次，结果正常；但再次运行时又出现了冻结错误。

有任何想法吗？提前致谢！

内核（Kernel）：

//********************************************************************
// matrixMultiplication_kernel.cu
//
// Kernel for a basic CUDA matrix multiplication program.
//********************************************************************

#ifndef MATRIXMULTIPLICATION_KERNEL
#define MATRIXMULTIPLICATION_KERNEL

/*
    Compile-time configuration shared by the kernel and the host code.
    The kernel performs no bounds checks and the host computes the grid
    size by integer division, so every dimension below should be a
    multiple of BLOCK_SIZE.
*/
#define BLOCK_SIZE 16 // Set thread block size
#define colsA 256     // Set matrix A column dimension
#define rowsA 256     // Set matrix A row dimension
#define colsB 256     // Set matrix B column dimension
#define rowsB colsA   // Set matrix B row dimension
#define colsC colsB   // Set matrix C column dimension
#define rowsC rowsA   // Set matrix C row dimension

//--------------------------------------------------------------------
// matrixMultiplication() - Tiled multiplication of matrixA by matrixB
//                          using shared memory. The product is
//                          recomputed 20000 times and the running sum
//                          is written to matrixC after every pass.
//
// LAUNCH: 2D grid with one BLOCK_SIZE x BLOCK_SIZE thread block per
//         BLOCK_SIZE x BLOCK_SIZE tile of matrixC. Uses
//         2 * BLOCK_SIZE * BLOCK_SIZE floats of static shared memory.
// PRE:  matrixA, matrixB, and matrixC are float pointers to row-major
//       matrices that the kernel can dereference; numColsA and
//       numColsB are the column counts of A and B. There are no
//       bounds checks, so every matrix dimension must be a multiple
//       of BLOCK_SIZE.
// POST: Each element of matrixC holds the accumulated sum of the
//       corresponding element of A * B over all passes (i.e. roughly
//       20000 times the product, subject to float rounding).
//--------------------------------------------------------------------
__global__ void matrixMultiplication(float * matrixA, float * matrixB,
                     float * matrixC, int numColsA,
                     int numColsB) {

    /* Shared memory tiles holding the current sub-matrices of A and B */
    __shared__ float subA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float subB[BLOCK_SIZE][BLOCK_SIZE];

    /*
        Running sum for this thread's element of C. It is accumulated
        with +=, so it must start at zero; it was previously left
        uninitialized, which made the result undefined.
    */
    float val = 0.0f;

    /* Set block and thread index positions */
    int blockX = blockIdx.x;
    int blockY = blockIdx.y;
    int threadX = threadIdx.x;
    int threadY = threadIdx.y;

    /*
        Starting and ending indices of the row of tiles of A handled
        by this block, and the step between consecutive tiles of A
    */
    int startA = numColsA * BLOCK_SIZE * blockY;
    int endA = startA + numColsA - 1;
    int subSizeA = BLOCK_SIZE;

    /*
        Starting index of the column of tiles of B handled by this
        block, and the step between consecutive tiles of B
    */
    int startB = BLOCK_SIZE * blockX;
    int subSizeB = BLOCK_SIZE * numColsB;

    /* Index of this thread's element of C (does not change per pass) */
    int position = numColsB * BLOCK_SIZE * blockY + BLOCK_SIZE * blockX;
    int outIndex = position + numColsB * threadY + threadX;

    /* Perform matrix multiplication 20000 times */
    for (int iteration = 0; iteration < 20000; iteration++) {

        /* Loop through matrix A and matrix B's sub-matrices */
        for (int i = startA, j = startB; i <= endA; i += subSizeA,
             j += subSizeB) {

            /* Each thread loads one element of each tile */
            subA[threadY][threadX] =
                matrixA[i + numColsA * threadY + threadX];
            subB[threadY][threadX] =
                matrixB[j + numColsB * threadY + threadX];

            /* Ensure that both tiles are fully loaded */
            __syncthreads();

            /* Accumulate the partial dot product for this tile pair */
            for (int k = 0; k < BLOCK_SIZE; ++k) {
                val += subA[threadY][k] * subB[k][threadX];
            }

            /*
                Ensure every thread is done with the tiles before the
                next pair overwrites them
            */
            __syncthreads();
        }

        /* Store the running sum for this element of C */
        matrixC[outIndex] = val;
    }
}

#endif

主机端（Host）：

//********************************************************************
// matrixMultiplication.cu
//
// A basic CUDA matrix multiplication program.
//********************************************************************

/* Include necessary libraries and kernel */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <matrixMultiplication_kernel.cu>

/* Function declarations */
void fillMatrix(float * matrix, int numIndices);

//--------------------------------------------------------------------
// checkCuda - Stops the program with a diagnostic if a CUDA runtime
//             call did not succeed.
//
// PRE:  status is the value returned by a CUDA runtime call; what is
//       a short description of that call for the error message.
// POST: Returns normally if status is cudaSuccess; otherwise prints
//       the CUDA error string to stderr and exits with EXIT_FAILURE.
//--------------------------------------------------------------------
static void checkCuda(cudaError_t status, const char * what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

//*************
// Main Program
//*************
int main(int argc, char** argv) {

    /* Declare device memory */
    float * deviceA = NULL;
    float * deviceB = NULL;
    float * deviceC = NULL;

    /*
        The kernel has no bounds checks and the grid size below is
        computed by integer division, so reject dimensions that are
        not multiples of BLOCK_SIZE.
    */
    if (colsA % BLOCK_SIZE != 0 || rowsA % BLOCK_SIZE != 0 ||
        colsB % BLOCK_SIZE != 0) {
        fprintf(stderr,
                "Matrix dimensions must be multiples of BLOCK_SIZE (%d)\n",
                BLOCK_SIZE);
        return EXIT_FAILURE;
    }

    srand(2013); // Set random seed

    /* Determine total number of indices in each matrix */
    unsigned int numIndicesA = colsA * rowsA;
    unsigned int numIndicesB = colsB * rowsB;
    unsigned int numIndicesC = colsC * rowsC;

    /* Determine memory size of each matrix */
    size_t memoryA = sizeof(float) * numIndicesA;
    size_t memoryB = sizeof(float) * numIndicesB;
    size_t memoryC = sizeof(float) * numIndicesC;

    /*
        Allocate host memory for each matrix. Matrix C is allocated
        with calloc so that it really is all zeros until the result
        is copied back.
    */
    float * matrixA = (float *) malloc(memoryA);
    float * matrixB = (float *) malloc(memoryB);
    float * matrixC = (float *) calloc(numIndicesC, sizeof(float));
    if (matrixA == NULL || matrixB == NULL || matrixC == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(matrixA);
        free(matrixB);
        free(matrixC);
        return EXIT_FAILURE;
    }

    /* Set contents of matrices A and B (matrix C is all zeros) */
    fillMatrix(matrixA, numIndicesA);
    fillMatrix(matrixB, numIndicesB);

    /* Allocate device memory for each matrix */
    checkCuda(cudaMalloc((void **) &deviceA, memoryA), "cudaMalloc(A)");
    checkCuda(cudaMalloc((void **) &deviceB, memoryB), "cudaMalloc(B)");
    checkCuda(cudaMalloc((void **) &deviceC, memoryC), "cudaMalloc(C)");

    /* Copy host memory to device memory for matrices A and B */
    checkCuda(cudaMemcpy(deviceA, matrixA, memoryA,
                         cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(deviceB, matrixB, memoryB,
                         cudaMemcpyHostToDevice), "copy of B to device");

    /* Set thread count to BLOCK_SIZE x BLOCK_SIZE */
    dim3 tCount(BLOCK_SIZE, BLOCK_SIZE);

    /* Set thread block count */
    dim3 tbCount((colsC / tCount.x), (rowsC / tCount.y));

    /* Run kernel */
    matrixMultiplication <<< tbCount, tCount >>> (deviceA, deviceB,
                          deviceC, colsA,
                          colsB);

    /*
        A launch does not return a status. Check the launch itself,
        then wait for the kernel so that a failure during execution
        (for example a long-running kernel being killed by a system
        time-out) is reported here instead of showing up as an
        all-zero result.
    */
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    /* Copy device memory to host memory for matrix C */
    checkCuda(cudaMemcpy(matrixC, deviceC, memoryC,
                         cudaMemcpyDeviceToHost), "copy of C to host");

    /* Print at most the first 256 elements of matrix C */
    unsigned int printCount = numIndicesC < 256 ? numIndicesC : 256;
    for (unsigned int i = 0; i < printCount; i++) {
        printf("%f ", matrixC[i]);
    }
    printf("\n");

    /* Free up host and device memory for each matrix */
    free(matrixA);
    free(matrixB);
    free(matrixC);
    checkCuda(cudaFree(deviceA), "cudaFree(A)");
    checkCuda(cudaFree(deviceB), "cudaFree(B)");
    checkCuda(cudaFree(deviceC), "cudaFree(C)");

    return 0;
}

//--------------------------------------------------------------------
// fillMatrix - Writes a pseudo-random float into every element of the
//              matrix.
//
// PRE:  matrix points to storage for at least numIndices floats;
//       numIndices is the number of elements to fill.
// POST: The first numIndices elements of matrix each hold
//       rand() / (float)RAND_MAX, i.e. a value between 0 and 1. If
//       numIndices is zero or negative nothing is written.
//--------------------------------------------------------------------
void fillMatrix(float * matrix, int numIndices) {

    /* Largest value rand() can return, as a float divisor */
    const float largest = (float)RAND_MAX;

    /* Next element to be written */
    float * cursor = matrix;

    /* Count down the remaining elements, filling front to back */
    int remaining = numIndices;
    while (remaining > 0) {
        *cursor = rand() / largest;
        ++cursor;
        --remaining;
    }
}

Makefile：

# Compiler driver used for both compiling and linking (nvcc).
GCC = nvcc
CUDA_INSTALL_PATH := /usr/local/cuda
INCLUDES := -I. -I$(CUDA_INSTALL_PATH)/include
CUDA_LIBS := -L$(CUDA_INSTALL_PATH)/lib -lcudart

# 'all' and 'clean' are commands, not files.
.PHONY: all clean

# Default goal: build the executable rather than only the object file.
all: matrixMultiplication

# matrixMultiplication.cu #includes the kernel source, so the object
# must be rebuilt when either file changes.
matrixMultiplication.o: matrixMultiplication.cu matrixMultiplication_kernel.cu
	$(GCC) $(INCLUDES) -c matrixMultiplication.cu -o $@

matrixMultiplication: matrixMultiplication.o
	$(GCC) -o $@ matrixMultiplication.o $(CUDA_LIBS)

# Remove the executable as well as objects and editor backups.
clean:
	$(RM) matrixMultiplication *.o *~

score 1 · Accepted Answer

问题解决了！由于内核运行时间过长，这是一个系统超时问题。切换到纯终端（无图形界面）模式后，我就规避了这个问题。

感谢大家的帮助！

cuda - CUDA 矩阵乘法锁定并显示零矩阵

1 回答 1

Related

Reference