0

我有一个目前在一个块上运行的 3D 内核:

// The two following variables are set elsewhere in the program.
// I give them possible value here for demonstration purposes.
int* N = {14, 5, 1};
int L = 2; // N's size - 1

int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.x * blockDim.y + threadIdx.y;
int idz = blockIdx.x * blockDim.z + threadIdx.z;

int idxInc = idx + 1; // for not to waste threads whose idx = 0
if (idxInc >= 1 && idxInc <= L)
{
    if (idy < N[idxInc])
    {       
        if (idz < N[idxInc-1])
        {
            dw[ idxInc ][ idy ][ idz ] = 0;
        }
    }
}

如果我在一个尺寸为 {2, 5, 14} 的块上启动这个内核,一切都很好。这正是块的每个维度所需的线程数,以便内核执行前两行中定义的数据工作。现在,我不知道如何在多个块之间划分这项工作。我的大脑错误只是试图在两个块上为每个维度找到合适数量的线程。此外,L 可能会有所不同(但我可能会对此有所限制),更有可能 N[1] 会发生很大变化(在此示例中为 5,但可能是 128、256 或 2048...)。所以我必须找到一种算法来自动平衡块的数量,以及块的三个维度中每个维度的线程数。

我真的不知道该怎么办,现在我觉得我很愚蠢!我开始认为我应该停止玩 3 维……或者也许有一个我看不到的简单技巧……

一些帮助?谢谢!

编辑:串行检查结果...

for (layer = 1; layer <= L; layer++)
{
    for (i = 0; i < N[layer]; i++)
    {
        for (j = 0; j < N[layer-1]; j++)
        {
            printf("%1.0f", dw[ layer ][ i ][ j ]);
        }
        printf("\n");
    }
    printf("\n");
}

显示的每个数字都应为 0。

4

1 回答 1

1

这是一个简单的示例代码(我认为)您所描述的内容:

#include <stdio.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for simplicity assume grid is an even multiple of blocksizes
#define XSIZE 1024
#define YSIZE 14
#define ZSIZE 2
#define TSIZE (XSIZE*YSIZE*ZSIZE)
#define BLKX 16
#define BLKY 14
#define BLKZ 2


#define IDX(x,y,z) ((z*(XSIZE*YSIZE))+(y*XSIZE)+x)

typedef float mytype;

__global__ void mykernel(mytype *data){

  int idx = threadIdx.x + blockDim.x*blockIdx.x;
  int idy = threadIdx.y + blockDim.y*blockIdx.y;
  int idz = threadIdx.z + blockDim.z*blockIdx.z;

  if ((idx < XSIZE)&&(idy < YSIZE)&&(idz < ZSIZE))
    data[IDX(idx,idy,idz)] = (mytype)idx;
  if ((idx==127)&&(idy==13)&&(idz==1)) printf("BONJOUR\n");
}

int main(){


// for simplicity assume grid is an even multiple of blocksizes
  dim3 block(BLKX, BLKY, BLKZ);
  dim3 grid(XSIZE/BLKX, YSIZE/BLKY, ZSIZE/BLKZ);

  mytype *h_data, *d_data;

  h_data=(mytype *)malloc(TSIZE*sizeof(mytype));
  if (h_data == 0) {printf("malloc fail\n"); return 1;}
  cudaMalloc((void **)&d_data, TSIZE*sizeof(mytype));
  cudaCheckErrors("cudaMalloc fail");

  for (int x=0; x<XSIZE; x++)
    for (int y=0; y<YSIZE; y++)
      for (int z=0; z<ZSIZE; z++)
        h_data[IDX(x,y,z)] = (mytype)0;

  cudaMemcpy(d_data, h_data, TSIZE*sizeof(mytype), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");

  mykernel<<<grid, block>>>(d_data);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");

  cudaMemcpy(h_data, d_data, TSIZE*sizeof(mytype), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy fail");


  for (int x=0; x<XSIZE; x++)
    for (int y=0; y<YSIZE; y++)
      for (int z=0; z<ZSIZE; z++)
        if(h_data[IDX(x,y,z)] != (mytype)x) {printf("data check fail at (x,y,z) = (%d, %d, %d), was: %f, should be: %f\n", x,y,z, h_data[IDX(x,y,z)], x); return 1;}
  printf("Data check passed!\n");


  return 0;
}

编译:

nvcc -arch=sm_20 -o t159 t159.cu

当我运行它时,我得到:

BONJOUR
Data check passed!
于 2013-05-24T05:05:37.777 回答