-2

我编写了一个 CUDA 程序来模拟John Conway 的 Game of Life,但有时我会出现内存泄漏(出现不应该存在的单元格)。

这是我的内核:

__global__ void gameOfLife(matrix pcuda_main,int lblock,int generations_to_run) {
/************************************
notice: first dimension is one section for read and one for 
write(change purpose every simulation generation). 2 extra section rows
for first and last line not need to check their up and down position evry time
also edge lines ensure that blocks handles edge sectors are having edge lines less that other blocks
************************************/
    __shared__ unsigned int section[2][SECTION_SIZE][CELLS_IN_LINE];
    int i,j;
    unsigned int sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b;
    unsigned int left_top,left,left_bot,right_top,right,right_bot;
    int read_section=0;
    int write_section=1;
    int bx = blockIdx.x;
    int row = SECTION_ROWS*blockIdx.x+threadIdx.x;
    int rowx = threadIdx.x+SECTION_ROWS;

    // I am zeroeing the perimiters lines since they dont loaded with values and can be corrupt while more lines may be zeroes its done for avoiding ifs
    section[0][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    __syncthreads(); // ensure no crashes between zeroeing perimiter lines and loading data
    // since entire warp access the first and last cell together no extrag edges are needed however only first and last line access lines beyond the edges
    for(i=0;i<CELLS_IN_LINE;i++) {
        if ( bx > 0 ) {
            section[0][rowx-SECTION_ROWS+1][i] = pcuda_main[((row-SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
        }
        section[0][rowx+1][i] = pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i];
        if ( bx < lblock ) {
            // not last block sector row
            section[0][rowx+SECTION_ROWS+1][i] = pcuda_main[((row+SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
        }
    }
    __syncthreads(); // ensure all data read
    for ( i=0;i<generations_to_run;i++ ) {
        for(j=0;j<CELLS_IN_LINE;j++) {
            if ( bx > 0 ) {
                if ( j > 0 ) {
                    left_top = section[read_section][rowx-SECTION_ROWS][j-1];
                    left = section[read_section][rowx-SECTION_ROWS+1][j-1];
                    left_bot = section[read_section][rowx-SECTION_ROWS+2][j-1];
                } else {
                    left_top = 0;
                    left=0;
                    left_bot=0;
                }
                if ( j<CELLS_IN_LINE_RESIDUE ) {
                    right_top= section[read_section][rowx-SECTION_ROWS][j+1];
                    right= section[read_section][rowx-SECTION_ROWS+1][j+1];
                    right_bot= section[read_section][rowx-SECTION_ROWS+2][j+1];
                } else {
                    right_top = 0;
                    right=0;
                    right_bot=0;
                }

                CELL32(section[write_section][rowx-SECTION_ROWS+1][j],
                    left_top,section[read_section][rowx-SECTION_ROWS][j],right_top,
                    left,section[read_section][rowx-SECTION_ROWS+1][j],right,
                    left_bot,section[read_section][rowx-SECTION_ROWS+2][j],right_bot,
                    sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            }
            if ( j > 0 ) {
                left_top = section[read_section][rowx][j-1];
                left = section[read_section][rowx+1][j-1];
                left_bot = section[read_section][rowx+2][j-1];
            } else {
                left_top = 0;
                left=0;
                left_bot=0;
            }
            if ( j<CELLS_IN_LINE_RESIDUE ) {
                right_top= section[read_section][rowx][j+1];
                right= section[read_section][rowx+1][j+1];
                right_bot= section[read_section][rowx+2][j+1];
            } else {
                right_top = 0;
                right=0;
                right_bot=0;
            }
            CELL32(section[write_section][rowx+1][j],
                left_top,section[read_section][rowx][j],right_top,
                left,section[read_section][rowx+1][j],right,
                left_bot,section[read_section][rowx+2][j],right_bot,
                sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            if ( bx < lblock ) {
                if ( j > 0 ) {
                    left_top = section[read_section][rowx+SECTION_ROWS][j-1];
                    left = section[read_section][rowx+SECTION_ROWS+1][j-1];
                    left_bot = section[read_section][rowx+SECTION_ROWS+2][j-1];
                } else {
                    left_top = 0;
                    left=0;
                    left_bot=0;
                }
                if ( j<CELLS_IN_LINE_RESIDUE ) {
                    right_top= section[read_section][rowx+SECTION_ROWS][j+1];
                    right= section[read_section][rowx+SECTION_ROWS+1][j+1];
                    right_bot= section[read_section][rowx+SECTION_ROWS+2][j+1];
                } else {
                    right_top = 0;
                    right=0;
                    right_bot=0;
                }
                CELL32(section[write_section][rowx+SECTION_ROWS+1][j],
                    left_top,section[read_section][rowx+SECTION_ROWS][j],right_top,
                    left,section[read_section][rowx+SECTION_ROWS+1][j],right,
                    left_bot,section[read_section][rowx+SECTION_ROWS+2][j],right_bot,
                    sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            }
        }
        read_section = read_section^1;
        write_section = write_section^1;
        //printf("passed %u generation for row: %u\n",i,row);
        __syncthreads();
    }

    // now writing back to the global memory notice write section turns into read section after every generation so 
    // I write the read section
    for(i=0;i<CELLS_IN_LINE;i++) {
        pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i] = section[read_section][rowx+1][i];
    }
    __syncthreads();
}

这是单元格 32 的定义及其依赖项:

#define HALFADDER(s0,s1,a0,a1)do{s1=(a0)&(a1);s0=(a0)^(a1);}while(0)
#define FULLADDER(s0,s1,a0,a1,a2)do{s1=((a0)&(a1))|((a2)&((a0)^(a1)));s0 =(a2)^((a0)^(a1));}while(0)

#define CELL32(output,top_left,top,top_right,left,cur,right,bot_left,bot,bot_right,sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b)do{FULLADDER(sum_top_b0,sum_top_b1,(top_left<<31)|(top>>1),top,(top_right>>31)|(top<<1));HALFADDER(sum_cur_b0,sum_cur_b1,(left<<31)|(cur>>1),(right>>31)|(cur<<1));FULLADDER(sum_bot_b0,sum_bot_b1,(bot_left<<31)|(bot>>1),bot,((bot_right>>31)|(bot<<1)));FULLADDER(newone,newtwo,sum_bot_b0,sum_cur_b0,sum_top_b0);FULLADDER(newtwo,new4a,newtwo,sum_bot_b1,sum_top_b1);HALFADDER(newtwo,new4b,newtwo,sum_cur_b1);newone=newone|cur;output=newone&newtwo&(~new4a)&(~new4b);}while(0)

这是主循环和内存副本:

cudaMalloc((void **)(&pcuda),NUM_CELLS*sizeof(unsigned int));
    //cudaMalloc((void **)(&pdata),ROWS*sizeof(int));
    cudaMemcpy((void *)pcuda,(void *)p,sizeof(unsigned int)*NUM_CELLS,cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    while ( generations > 0 ) {
        if ( generations > SECTION_ROWS ) {
            generations_run = SECTION_ROWS;
        } else {
            generations_run = generations;
        }
        generations -= generations_run;
        printf("running params last_row:%u, generations_run:%u, generations left:%u,grid size:%d, array size in bytes: %u,last cell index: %d\n",
            ROWS-SECTION_ROWS,generations_run,generations,dimGrid.x,NUM_CELLS*sizeof(unsigned int),((ROWS-1)<<LINE_NUMBER_BITS_OFFSET)+CELLS_IN_LINE_RESIDUE);
        gameOfLife<<<dimGrid,dimBlock>>>(pcuda,dimGrid.x-1,generations_run);
        error = cudaDeviceSynchronize();
        if (error != cudaSuccess)
        {
            printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
            lineid = __LINE__;
            err = error;
        }
        else
        {
            printf("GPU Device %d:synchronized\n", devID);
        }
    }
    cudaMemcpy((void *)p,(void *)pcuda,sizeof(int)*NUM_CELLS,cudaMemcpyDeviceToHost);

内存泄漏在哪里?

4

1 回答 1

3

CUDA 工具包带有 cuda-memcheck,默认情况下,它将检查内核中的越界访问。它还具有其他模式,包括泄漏检查器。请注意,您需要在退出之前调用 cudaDeviceReset() 以便该工具知道查找未释放的设备内存。

于 2013-01-26T22:41:09.383 回答