0

我试图仅将数组的第一个元素设置为 5.0(比如说)。即,只有一个线程应该设置一个值,而其他线程的其余部分不做任何事情。

这是我的完整代码

#include <stdio.h>
#include <cuda.h>

#define GPUERRCHK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

void writeBuf( char * fName, float * out_frame, int dim )
{
    FILE * fp = fopen( fName, "w+" );
    int baseIndx = 0;
    for( int i=0 ; i<dim ; i++ )
    {
        for( int j=0 ; j<dim ; j++ )
        {
            fprintf( fp, "%f ", out_frame[ dim + j ] );
        }
        baseIndx += dim;
        fprintf( fp, "\n" );
    }
    fclose( fp );

}

__global__ void kernel( float * s1, float * s2, int dim, int * hx, int *hy, float *hT, int nHeaters )
{
    int x = threadIdx.x + blockIdx.x*blockDim.x;
    int y = threadIdx.y + blockIdx.y*blockDim.y;
    int offset = x + y*blockDim.x*gridDim.x;


    if( offset < 1 )
    {
    s2[0] = 1.0;
    }

    __syncthreads();

}



int main()
{
    srand48( time(NULL) );
    int dim = 1024;

    float *dev_s1, *dev_s2;
    GPUERRCHK( cudaMalloc( (void**)&dev_s1, dim*dim * sizeof(float) ));
    GPUERRCHK( cudaMalloc( (void**)&dev_s2, dim*dim * sizeof(float) ));
    GPUERRCHK( cudaMemset( dev_s1, 0x00, dim*dim * sizeof(float) ));
    GPUERRCHK( cudaMemset( dev_s2, 0x00, dim*dim * sizeof(float) ));



    //heaters
    int *dev_hx, *dev_hy;
    float *dev_hT;
    int nHeaters = 20;
    GPUERRCHK( cudaMalloc( (void**)&dev_hx, nHeaters * sizeof(int) ));
    GPUERRCHK( cudaMalloc( (void**)&dev_hy, nHeaters * sizeof(int) ));
    GPUERRCHK( cudaMalloc( (void**)&dev_hT, nHeaters * sizeof(float) ));


    //init heaters on cpu
    int * hx, *hy;
    float * hT;
    hx = (int*) malloc( nHeaters * sizeof(int) );
    hy = (int*) malloc( nHeaters * sizeof(int) );
    hT = (float*) malloc( nHeaters * sizeof(float) );
    for( int i=0 ; i<nHeaters ; i++ )
    {
    hx[i] = (int) ((float)drand48() * (float)dim) + 5;
    hy[i] = (int) (drand48() * dim) + 5;
    hT[i] = (float) (drand48() * 100) + 50;
    }

    //transfer hx, hy, hT to GPU
    GPUERRCHK( cudaMemcpy( dev_hx, hx, nHeaters * sizeof(int), cudaMemcpyHostToDevice ));
    GPUERRCHK( cudaMemcpy( dev_hy, hy, nHeaters * sizeof(int), cudaMemcpyHostToDevice ));
    GPUERRCHK( cudaMemcpy( dev_hT, hT, nHeaters * sizeof(float), cudaMemcpyHostToDevice ));


    float *out_frame = (float *) malloc( dim*dim*sizeof(float) );


    // run kernel
    int nThreadsPerBlock = 16;
    int nBlockX = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;
    int nBlockY = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;
    kernel<<< dim3(nBlockX, nBlockY), dim3(nThreadsPerBlock, nThreadsPerBlock) >>>( dev_s1, dev_s2, dim, dev_hx, dev_hy, dev_hT, nHeaters );
    GPUERRCHK( cudaPeekAtLastError() );
    GPUERRCHK( cudaDeviceSynchronize() );


    // collect result
    GPUERRCHK( cudaMemcpy( out_frame, dev_s2, dim*dim * sizeof(float), cudaMemcpyDeviceToHost ) );


    int f=1;
    char fName[100];
    snprintf( fName, 100, "out/file_%04d.data", f );
    writeBuf( fName, out_frame, dim );


    cudaFree( dev_s1 );
    cudaFree( dev_s2 );

    free( out_frame );
}

当我运行它时,该文件包含所有零。我如何实现我计划实现的目标?可能是什么问题呢?

4

1 回答 1

1

您的问题出在 writeBuf(..) @line 中:

fprintf( fp, "%f ", out_frame[ dim + j ] );

dim 是 1024 并且您正在访问从 1024 到 1024+dim-1 的元素,这就是为什么您永远看不到第一个元素的原因。
正确的行应该是:

fprintf( fp, "%f ", out_frame[ baseIndx + j ] );
于 2013-02-04T11:27:38.853 回答