cuda - 使用 CUDA 求解 Ax=b，适用于小于 128x128 的矩阵大小

Question

我编写了一个使用 GPU 进行正向求解的代码（从 A = LU 获取 U 矩阵，其中 U 的对角线条目未设置为单位）。我的代码适用于尺寸小于 128x128 的矩阵，这是块大小的一个因素，即 16。我很困惑为什么更大的矩阵会给我错误的结果。最后一行块出现问题。

------------ gaussian.h --------

// Thread block size
#define BLOCK_SIZE 16   //for forward solve A = LU
#define BLOCK_SIZE2 32  //for finding the pivot factor
// A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
// pivot factor = A[k,j] / A[k,k]

///////////////////////////////////////// /////////////////////////////////

------------ gaussian.cu --------

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include <cuda.h>
#include "gaussian_kernel.cu"

#define OUTPUT

void runTest(int argc, char** argv);

double gettime() {
    struct timeval t;
    gettimeofday(&t,NULL);
    return t.tv_sec+t.tv_usec*1e-6;
}

int main(int argc, char** argv)
{
    runTest(argc, argv);
}

void runTest(int argc, char** argv)
{
    int dim;

    if (argc == 2)
    {
        dim = atoi(argv[1]);
    }
    else{
        printf("Wrong Usage\n");
        exit(1);
    }

    // Note: A is a square matrix of size NxN where N % BLOCK_SIZE = 0
    //       Every row of A has a pivot column
    //       It is known that all the arithmetic operations involved for 
    //       Gaussian Elimination are of type int (not float)

    // allocate host memory for matrix A augmented with vector b (Ax=b)
    unsigned int size_A = dim * (dim + BLOCK_SIZE);  
    unsigned int mem_size_A = sizeof(int) * size_A;
    int* h_A = (int*) malloc(mem_size_A);

    // initialize host memory, generate a test case such as below
    // augmented matrix A with padding to make the size evenly divisible by BLOCK_SIZE
    //   ----A----- | b --padding--
    //   1 1 1 1 .. | 0 * * * ..
    //   1 2 2 2 .. | 1 * * * ..
    //   1 2 3 3 .. | 2 * * * ..
    //   1 2 3 4 .. | 3 * * * ..
    //   .......... | ...
    // * means uninitialized entry

    // A is size NxN
    // Augmented matrix with padding is size Nx(N+BLOCK_SIZE)
    int dimRow = dim;  
    int dimCol = dim + BLOCK_SIZE; 

    for(int i = 0; i < dim; i++)
    {       
        h_A[(i + 1) * dimCol - BLOCK_SIZE] = i;  // b vector stored in (N+1)th column 
        // of augmented matrix
        for (int j = 0; j < dimRow - i; j++)
        {
            h_A[j + i + i * dimCol] = i + 1;            // A[i][j] entry
            h_A[j * dimCol + i + i * dimCol] = i + 1;   // A[j][i] entry
        }
    }


    //display the test case  [ A | b ]
    for ( int m = 0 ; m < dimRow; m++)
    {
        for ( int n = 0 ; n <= dimRow; n++)
            //for ( int n = 0 ; n < dimCol; n++)
        {
            printf("%d ", h_A[m * dimCol + n]);
        }
        printf("\n");
    }


    // allocate device memory for the augmented matrix A
    int* d_A;
    cudaMalloc(&d_A, mem_size_A);

    // start timer
    double timer1 = gettime();

    // copy host memory to device
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    // setup execution parameters for obtaining the pivot factor
    int gridP = (dimRow / BLOCK_SIZE2) + ( (dimRow % BLOCK_SIZE2) == 0 ? 0 : 1 ); 
    // add 1 if dimRow/BLOCK_SIZE2 is not evenly divisible

    // setup execution parameters for the forward solve (Gaussian Elimination)
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(dimCol / threads.x, dimRow / threads.y);

    // execute the kernel
    for ( int i = 0 ; i < dim; i++)
    {
        Gaussian_Pivot_CUDA<<< gridP, BLOCK_SIZE2 >>>(d_A, dimCol, i);
        Gaussian_Elim_CUDA<<< grid, threads >>>(d_A, dimCol, i);
    }

    // copy result from device to host
    cudaMemcpy(h_A, d_A, mem_size_A, cudaMemcpyDeviceToHost);

    // stop timer
    double timer2 = gettime();
    printf("GPU time = %lf\n",(timer2-timer1)*1000);

    // Back substitution 
    int X[dimRow];
    for (int row = dimRow - 1; row >= 0; row--) 
    {
        X[row] = h_A[(row + 1) * dimCol - BLOCK_SIZE];  // b[row] entry
        for (int col = dimRow - 1; col > row; col--) 
        {
            X[row] -= h_A[row * dimCol + col] * X[col];
        }
        X[row] /= h_A[row * dimCol + row];
        printf("X[%d] = %d\t",row,X[row]);
    }
    printf("\n");


#ifdef OUTPUT

    // result of Gaussian Elimination 
    //   ----A----- | b --padding--
    //   1 1 1 1 .. | 0 * * * ..
    //   0 1 1 1 .. | 1 * * * ..
    //   0 0 1 1 .. | 1 * * * ..
    //   0 0 0 1 .. | 1 * * * ..
    //   .......... | ...
    // * means garbage entry 
    for ( int m = 0 ; m < dimRow; m++)
    {
        for ( int n = 0 ; n <= dimRow; n++)
        {
            printf("%d ", h_A[m * dimCol + n]);
        }
        printf("\n");
    }

#endif

    free(h_A);
    cudaFree(d_A);
}

///////////////////////////////////////// /////////////////////////

------------ gaussian_kernel.cu --------

#include "gaussian.h"

__global__ 
void Gaussian_Pivot_CUDA(int* A, int widthA, int Pcol)
{
    // find the pivot factor 
    // A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
    // pivot factor = A[k,j] / A[k,k]

    int bx = blockIdx.x;
    int tx = threadIdx.x;

    int index_row = BLOCK_SIZE2 * bx + tx;
    int index_rowA = index_row * widthA;

    __shared__ int A_RowRow; 
    if (tx == 0)
    {
        A_RowRow = A[Pcol * widthA + Pcol];  // get A[k,k] where k is the pivot column
        // for this problem pivot column = pivot row
    }       
    __syncthreads();

    A[index_rowA - 1] = A[index_rowA + Pcol] / A_RowRow;  
    // INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    // store pivot factor at A[dimCol - 1][row]
}

__global__
void Gaussian_Elim_CUDA(int* A, int widthA, int Pcol)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int index_col = BLOCK_SIZE * bx + tx;
    int index_row = BLOCK_SIZE * by + ty;

    int index = widthA * index_row + index_col; 
    //  d_A[index] "=" d_A[index_row][index_col]

    // store pivot factor for rows that we are working on in this block 
    __shared__ int pivotFactor[BLOCK_SIZE];

    if ( ty == 0 ) // use ty instead of tx for coalesced accesses
    {
        pivotFactor[tx] = A[(index_row + tx) * widthA - 1];  
        // INVALID WRITE HERE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    }
    __syncthreads();

    // implement the gaussian elimination for the current row
    if ( index_row > Pcol ) 
    {
        A[index] -= A[Pcol * widthA + index_col] * pivotFactor[ty];
        // A[i,j] = A[i,j] - A[k,j] / A[k,k] * A[i,k]
        // pivot factor = A[k,j] / A[k,k]
    }
}

score 1 · Accepted Answer

cuda-memcheck告诉我，我正在阅读和写作超出范围的东西。

我的索引关闭了。在Gaussian_Pivot_CUDA中，最后一行应该是A[widthA*(index_row+1)-1)]=...

在Gaussian_Elim_CUDA，写作时，pivotFactor我必须更改ty为tx并阅读A[widthA*(index_row+1)-1)]。

谢谢@尤金！

score 0 · Accepted Answer

如果sizeof(int)==2，则当 dim 为 128 时，此数组索引将溢出：

h_A[j * dimCol + i + i * dimCol] = i + 1;   // A[j][i] entry

(127*144)+127+(127*144) = 36703 > 32767。

cuda - 使用 CUDA 求解 Ax=b，适用于小于 128x128 的矩阵大小

2 回答 2

Related

Reference