1

首先,对不起,如果我的语法很糟糕,我在处理不同维度(160x320)的二维数组时遇到问题。

dim3    blocks(DIMX/16,DIMZ/32);
dim3    threads(16,16);  

这段代码编译得很好,但不知何故只处理了 160x160,剩下的数组仍然为零。我做错了吗?

#include "cuda.h"
#include "conio.h"
#include <fstream>
#include <sstream>
#include <iostream>
#include <assert.h>
#include "../common/book.h"
#define DIMX 160
#define DIMZ 320
#define PI 3.1415926535897932f
#define dx 1.0
#define dz 1.0
#define dt 0.001
#define samp 500
#define nite 1000


__global__ void txz_kernel(float *txz,float *vz)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
vz[offset]=txz[offset]+vz[offset];
}

int  main( void ) 
{
    float              *txz;
    float              *vz;

        HANDLE_ERROR( cudaMalloc( (void**)&txz, DIMX * DIMZ * sizeof(float)));
        HANDLE_ERROR( cudaMalloc( (void**)&vz, DIMX * DIMZ * sizeof(float)));

        float *tempvz = (float*)malloc( sizeof(float)*(DIMX*DIMZ));    
        float *temptxz = (float*)malloc( sizeof(float)*(DIMX*DIMZ)); 

    for (int i=0; i<DIMX; i++) {
        for (int j=0; j<DIMZ; j++) {
        int ij=DIMX*j + i; 
        tempvz[ij]=0.0;
        temptxz[ij]=100.0;
        }
    }

    for (int i=0; i<DIMX; i++) {
        for (int j=(121); j<DIMZ; j++) {
        int ij=DIMX*j + i;       
        tempvz[ij]=0.0;
        temptxz[ij]=150.0;
        }
    }
            HANDLE_ERROR( cudaMemcpy( vz, tempvz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) );
            HANDLE_ERROR( cudaMemcpy( txz, temptxz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) );
                dim3    blocks(DIMX/16,DIMZ/32);
                dim3    threads(16,16);

            txz_kernel<<<blocks,threads>>>(txz,vz) ;            

            float *tempse = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
            HANDLE_ERROR( cudaMemcpy( tempse, vz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyDeviceToHost ) );
            std::ofstream outseis("contour.ctxt"); // output, normal file
            for (int jj=0; jj<DIMZ; jj++) 
            {
                for (int ii=0; ii<DIMX; ii++) 
                {
                int ij=DIMX*jj + ii;               
                outseis<<tempse[ij]<<" ";   
                }
                outseis<<"\r\n";
            }
}
4

1 回答 1

1

该线路块(DIMX/16,DIMZ/32);应该是块(DIMX/16,DIMZ/16);如果其他一切都很好..

还要检查它应该是的索引

int ij=DIMZ*i + j;

如果您正在连续处理主要订单。如果您正在按列主要顺序处理您所写的内容是正确的。

这是您的代码的略微修改版本,它正在编译并给出正确的结果,即添加 2 个数组并找到总和给我 102400 (160 * 320 + 160 *320)

输入你自己的号码并检查..

注意:这是针对行主要订单的。

#include "cuda.h"
#include <fstream>
#include <sstream>
#include <iostream>
#include <assert.h>
#define DIMX 160
#define DIMZ 320
#define PI 3.1415926535897932f
#define dx 1.0
#define dz 1.0
#define dt 0.001
#define samp 500
#define nite 1000

__global__ void txz_kernel(float *txz,float *vz)
{
        int x = threadIdx.x + blockIdx.x * blockDim.x;
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        int offset = (x * blockDim.x * gridDim.x) + y ;


    if (offset < (DIMX * DIMZ))
    {
            vz[offset]=txz[offset]+vz[offset];
    }
    else
    {
            printf ("Offset going out of the bounds\n") ;
    }

}

int  main( void )
{
    float              *txz;
float              *vz;
float               sum = 0.0 ;
float               *tempse ;

    HANDLE_ERROR( cudaMalloc( (void**)&txz, DIMX * DIMZ * sizeof(float)));
    HANDLE_ERROR( cudaMalloc( (void**)&vz, DIMX * DIMZ * sizeof(float)));

    float *tempvz = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
    float *temptxz = (float*)malloc( sizeof(float)*(DIMX*DIMZ));

for (int i=0; i<DIMX; i++) {
    for (int j=0; j<DIMZ; j++) {
    int ij=DIMZ*i + j;
    tempvz[ij]=1.0;
    temptxz[ij]=1.0;
    }
}

    cudaMemcpy( txz, temptxz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) ;
    cudaMemcpy( vz, tempvz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice ) ;

            dim3    blocks(DIMX/16,DIMZ/16);
            dim3    threads(16,16);

        txz_kernel<<<blocks,threads>>>(txz,vz) ;
        //cudaDeviceSynchronize() ;

        tempse = (float*)malloc( sizeof(float)*(DIMX*DIMZ));
        HANDLE_ERROR( cudaMemcpy( tempse, vz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyDeviceToHost ) );

        for (int jj=0; jj<DIMX; jj++)
        {
            for (int ii=0; ii<DIMZ; ii++)
            {
                    int ij=DIMZ*jj + ii;
                    sum += tempse[ij] ;
            }
        }

    printf ("The sum is %f\n", sum) ;
}
于 2013-07-05T04:48:14.970 回答