0

我目前正在学习 CUDA 以实现高性能计算。我有一个实现雅可比迭代的项目。我的程序某处出现内存错误,我很难追踪它。

我的 Jacobi 内核正确地运行了一次迭代,现在我正在计算旧矩阵和新矩阵之间的最大差异。如果我注释掉下一行代码:

//diff[idx] = BJacobi[idx] - AJacobi[idx];

有用。然而,包括这行代码,会导致 BJacbi 的数据被 AJacobi 的部分数据覆盖(或者至少我认为这是 AJacobi 的数据,它几乎是相同的模式)。这对我来说似乎是一个分配问题,但我不确定它在哪里。

__global__ 
void jacobi(float *diff, float *AJacobi, float *BJacobi, int *bitMask, int size) 
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0;
    int count = 0;

    if(idx < size * size)
    {
        if(bitMask[idx] == 0)
        { 
            //if left side of matrix
            if(idx - 1 > 0 && idx % size != 0) {
                sum += AJacobi[ idx - 1 ];
                count++;
            }
            //if right side of matrix
            if(idx + 1 < size * size && (idx + 1) % size != 0)
            {
                sum += AJacobi[ idx + 1 ];
                count++;
            }
            //if top of matrix
            if(idx - size > 0)
            {
                sum += AJacobi[ idx - size ];
                count++;
            }
            //if bottom of matrix
            if(idx + size < size * size)
            {
                sum  += AJacobi[ idx + size ];
                count++;
            }
            BJacobi[idx] = sum / count;
        }
        else BJacobi[idx] = AJacobi[idx];
    }

    //diff[idx] = BJacobi[idx] - AJacobi[idx];
}

在我的主要功能中

readSparceMatrix(argv[1], &matrix);
array_size = matrix.rowSize * matrix.rowSize;

//we want as many or more threads then data.
dimGrid = array_size / THREADS + 1;
dimBlock = THREADS;

// ---------------------- START ALLOCATION OF DEVICE MEMEORY
err = cudaMalloc( (void**)&diff, array_size * sizeof(float)); 
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
} 
err = cudaMalloc( (void**)&AJacobi, array_size * sizeof(float) );
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
} 
err = cudaMalloc( (void**)&BJacobi, array_size * sizeof(float) ); 
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
} 
err = cudaMalloc( (void**)&MaxDiffTree, array_size * sizeof(float) ); 
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
} 
err = cudaMalloc( (void**)&bitMask, array_size * sizeof(int) ); 
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
} 


// ---------------------- START INTITILIZATION OF DEVICE MEMERY 
err = cudaMemset(diff, 1.0, array_size * sizeof(float));
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    exit(1);
} 

err = cudaMemset(BJacobi, 0.0, array_size * sizeof(float));
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    exit(1);
} 

err = cudaMemset(MaxDiffTree, 0.0, array_size * sizeof(float));
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    exit(1);
} 

err = cudaMemcpy(AJacobi, matrix.data, array_size * sizeof(float) ,cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    exit(1);
} 

err = cudaMemcpy(bitMask, matrix.mask, array_size * sizeof(int) ,cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    exit(1);
} 

// ---------------------- START MAIN JACOBI LOOP
//while(MaxDiff >  delta){

jacobi<<<dimGrid, dimBlock>>>(diff, AJacobi, BJacobi, bitMask,  matrix.rowSize);
4

1 回答 1

1

所以这实际上是一个简单的错误,我花了很长时间试图弄清楚。问题发生是因为我有更多的线程然后数据。因此,我的线程索引超出了数组的范围。我的代码中的第一个 if 语句旨在检查这一点,但我的差异分配超出了我的索引检查。在 if 检查下移动 diff 语句解决了我的问题。

if(idx < size * size){
    if(bitMask[idx] == 0){ 
        //if left side of matrix
        if(idx - 1 > 0 && idx % size != 0) {
            sum += src[ idx - 1 ];
            count++;
        }
        //if right side of matrix
        if(idx + 1 < size * size && (idx + 1) % size != 0)
        {
            sum += src[ idx + 1 ];
            count++;
        }
        //if top of matrix
        if(idx - size > 0)
        {
            sum += src[ idx - size ];
            count++;
        }
        //if bottom of matrix
        if(idx + size < size * size)
        {
            sum  += src[ idx + size ];
            count++;
        }
        dst[idx] = sum / count;
    }
    else dst[idx] = src[idx];

    diff[idx] = dst[idx] - src[idx];
}   
于 2013-03-05T06:54:48.970 回答