0

考虑以下代码:

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda.h>
#include <cuda_runtime_api.h>

using namespace std;

// Marks one flag per block: thread 0 of every block stores 1 into
// A[blockIdx.x]; all other threads do nothing.
// A must hold at least gridDim.x elements, since it is indexed by blockIdx.x.
__global__ void reduce_or(char* A) {
    // Only the first thread of each block performs the write.
    if (threadIdx.x != 0) {
        return;
    }
    A[blockIdx.x] = 1;
}

// Checks the cudaError_t produced by `ans` and reports a failure together with
// the file and line of the call site.
// The do/while(0) wrapper makes the macro expand to a single statement, so
// `gpuErrchk(x);` is safe inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default), terminate the process with `code` as the
//           exit status after printing; when false, only print
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// Launches reduce_or with one block per array element and prints the index and
// value of every element that is non-zero after the kernel has run.
int main(int argc, char** argv) {
  const uint64_t group_size = 1 << 16; //1 << 15 would work
  char *dr;

  // Host buffer, zero-initialised so that any non-zero entry afterwards was
  // written by the kernel.
  std::vector<char> result(group_size, 0);

  gpuErrchk(cudaMalloc((void **)&dr, group_size));
  gpuErrchk(cudaMemcpy(dr, result.data(), group_size, cudaMemcpyHostToDevice));

  reduce_or<<<group_size, 32>>>(dr);
  // A kernel launch returns no status itself; a rejected launch configuration
  // (for example a grid dimension the compiled target does not support) is
  // only visible through cudaGetLastError(). Without this check the launch
  // can fail silently and the array is simply left unmodified.
  gpuErrchk(cudaGetLastError());

  // Surfaces errors that occur while the kernel is executing.
  gpuErrchk(cudaDeviceSynchronize());
  gpuErrchk(cudaMemcpy(result.data(), dr, group_size, cudaMemcpyDeviceToHost));

  // The index has the same unsigned type as group_size to avoid a
  // signed/unsigned comparison.
  for(uint64_t kk = 0; kk < group_size; ++kk) {
    if(result[kk]) {
      cout << std::dec << kk << std::hex << "  " << (unsigned long) result[kk] << endl;
    }
  }

  // Release the device buffer allocated above.
  gpuErrchk(cudaFree(dr));
  return 0;
}

如果块的数量大于或等于 65536,输入数组就不会被修改,尽管 CUDA 示例代码中的 device_query 显示,网格在 x 维度上的最大尺寸远大于 65536:

Device 0: "Tesla K20Xm"
  CUDA Driver Version / Runtime Version          6.5 / 6.5
  CUDA Capability Major/Minor version number:    3.5
  Total amount of global memory:                 5760 MBytes (6039339008 
  [...]
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  [...]

是我做错了什么,还是硬件谎报了自己的能力?这是一个已知的 bug 吗?难道不应该报错吗?

4

1 回答 1

3

请检查编译器选项:要使用超过 65535 的网格大小,必须通过选项 compute_30,sm_30 将最低计算能力设置为至少 3.0。

请参阅此处的表 13:http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities 。如表中所示,对于 2.x 设备:

线程块网格的最大 x 维度为 65535

于 2016-02-19T20:45:25.580 回答