c++ - 使用 nvprof 进行分析时没有 GPU 活动

Question

我nvprof.exe在初始化数据的函数上运行，调用三个内核并释放数据。所有的配置文件都应该这样，我得到了这样的结果：

==7956== Profiling application: .\a.exe
==7956== Profiling result:
 GPU activities:   52.34%  25.375us         1  25.375us  25.375us  25.375us  th_single_row_add(float*, float*, float*)                                   
                   43.57%  21.120us         1  21.120us  21.120us  21.120us  th_single_col_add(float*, float*, float*)                                       
                    4.09%  1.9840us         1  1.9840us  1.9840us  1.9840us  th_single_elem_add(float*, float*, float*)                        
      API calls:   86.77%  238.31ms         9  26.479ms  14.600us  210.39ms  cudaMallocManaged
                   12.24%  33.621ms         1  33.621ms  33.621ms  33.621ms  cuDevicePrimaryCtxRelease
                    0.27%  730.80us         3  243.60us  242.10us  245.60us  cudaLaunchKernel
                    0.15%  406.90us         3  135.63us  65.400us  170.80us  cudaDeviceSynchronize
                    0.08%  229.70us        97  2.3680us     100ns  112.10us  cuDeviceGetAttribute
                    0.08%  206.60us         1  206.60us  206.60us  206.60us  cuModuleUnload
                    0.01%  19.700us         1  19.700us  19.700us  19.700us  cuDeviceTotalMem
                    0.00%  6.8000us         1  6.8000us  6.8000us  6.8000us  cuDeviceGetPCIBusId
                    0.00%  1.9000us         2     950ns     400ns  1.5000us  cuDeviceGet
                    0.00%  1.8000us         3     600ns     400ns     800ns  cuDeviceGetCount
                    0.00%     700ns         1     700ns     700ns     700ns  cuDeviceGetName
                    0.00%     200ns         1     200ns     200ns     200ns  cuDeviceGetUuid
                    0.00%     200ns         1     200ns     200ns     200ns  cuDeviceGetLuid

==7956== Unified Memory profiling result:
Device "GeForce RTX 2060 SUPER (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
      18  20.000KB  8.0000KB  32.000KB  360.0000KB  300.7000us  Host To Device
      24  20.000KB  8.0000KB  32.000KB  480.0000KB  2.647400ms  Device To Host

如您所见，在GPU activities. 这是源代码：

void add_elem(int n) {
  float *a, *b, *c1, *c2, *c3;
  cudaMallocManaged(&a, n * n * sizeof(float));
  cudaMallocManaged(&b, n * n * sizeof(float));
  cudaMallocManaged(&c1, n * n * sizeof(float));
  cudaMallocManaged(&c2, n * n * sizeof(float));
  cudaMallocManaged(&c3, n * n * sizeof(float));

  for (int i = 0; i < n*n; i++) {
    a[i] = 1.0f;
    b[i] = 2.0f;
    c1[i] = 0.0f;
    c2[i] = 0.0f;
    c3[i] = 0.0f;
  }

  int blockSize = 256;
  int numBlocks = (n*n + blockSize - 1) / blockSize;
  th_single_elem_add<<<numBlocks, blockSize>>>(a, b, c1);
  th_single_row_add<<<numBlocks, blockSize>>>(a, b, c2);
  th_single_col_add<<<numBlocks, blockSize>>>(a, b, c3);

  cudaDeviceSynchronize();

  cudaFree(a);
  cudaFree(b);
  cudaFree(c1);
  cudaFree(c2);
  cudaFree(c3);
}

之后，我将初始化数据、内核调用和释放数据提取到单独的主机函数并nvprof再次调用。结果我只得到了有关 API 调用的信息，如下所示：

==18460== Profiling application: .\a.exe
==18460== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
      API calls:   81.86%  158.78ms         9  17.643ms  1.4000us  158.76ms  cudaMallocManaged
                    0.17%  322.80us        97  3.3270us     100ns  158.00us  cuDeviceGetAttribute
                    0.11%  214.50us         1  214.50us  214.50us  214.50us  cuModuleUnload
                    0.04%  68.600us         3  22.866us  7.3000us  39.400us  cudaDeviceSynchronize
                    0.01%  12.100us         9  1.3440us     400ns  7.9000us  cudaFree
                    0.00%  7.7000us         1  7.7000us  7.7000us  7.7000us  cuDeviceGetPCIBusId
                    0.00%  2.1000us         3     700ns     300ns  1.0000us  cuDeviceGetCount
                    0.00%  2.0000us         2  1.0000us     300ns  1.7000us  cuDeviceGet
                    0.00%  1.2000us         3     400ns     300ns     500ns  cudaLaunchKernel
                    0.00%     700ns         1     700ns     700ns     700ns  cuDeviceGetName
                    0.00%     300ns         1     300ns     300ns     300ns  cuDeviceGetUuid
                    0.00%     300ns         1     300ns     300ns     300ns  cuDeviceGetLuid

如您所见，也没有 section Unified Memory profiling result，所以我尝试像这样运行 nvprofnvprof.exe --unified-memory-profiling off .\a.exe但得到了相同的结果。源代码：

void add_elem(int n) {
  float *a, *b, *c1;
  cudaMallocManaged(&a, n * n * sizeof(float));
  cudaMallocManaged(&b, n * n * sizeof(float));
  cudaMallocManaged(&c1, n * n * sizeof(float));

  for (int i = 0; i < n*n; i++) {
    a[i] = 1.0f;
    b[i] = 2.0f;
    c1[i] = 0.0f;
  }

  int blockSize = 256;
  int numBlocks = (n*n + blockSize - 1) / blockSize;
  th_single_elem_add<<<numBlocks, blockSize>>>(a, b, c1);

  cudaDeviceSynchronize();

  cudaFree(a);
  cudaFree(b);
  cudaFree(c1);
}

void add_row(int n) {
  float *a, *b, *c1;
  cudaMallocManaged(&a, n * n * sizeof(float));
  cudaMallocManaged(&b, n * n * sizeof(float));
  cudaMallocManaged(&c1, n * n * sizeof(float));

  for (int i = 0; i < n*n; i++) {
    a[i] = 1.0f;
    b[i] = 2.0f;
    c1[i] = 0.0f;
  }

  int blockSize = 256;
  int numBlocks = (n + blockSize - 1) / blockSize;
  th_single_row_add<<<numBlocks, blockSize>>>(a, b, c1, n);

  cudaDeviceSynchronize();

  cudaFree(a);
  cudaFree(b);
  cudaFree(c1);
}

void add_col(int n) {
  float *a, *b, *c1;
  cudaMallocManaged(&a, n * n * sizeof(float));
  cudaMallocManaged(&b, n * n * sizeof(float));
  cudaMallocManaged(&c1, n * n * sizeof(float));

  for (int i = 0; i < n*n; i++) {
    a[i] = 1.0f;
    b[i] = 2.0f;
    c1[i] = 0.0f;
  }

  int blockSize = 256;
  int numBlocks = (n + blockSize - 1) / blockSize;
  th_single_col_add<<<numBlocks, blockSize>>>(a, b, c1, n);

  cudaDeviceSynchronize();

  cudaFree(a);
  cudaFree(b);
  cudaFree(c1);
}

更新：我发现了问题，我在数组中运行了包含 10000000000 个元素的代码，看起来甚至没有调用内核。因为我用 10000000 (10^8) 个元素运行它们，并且用了将近 3 秒的时间来完成，并且用 10000000000 (10^10) 元素立即完成。但是没有错误哦。

我应该如何捕捉这种情况？

score 0 · Accepted Answer

这里的原因是内核是用 unsupported 调用的<<<numBlocks, blockSize>>>。gpuErrchk( cudaPeekAtLastError() );在每次内核调用后添加后，我得到了GPUassert: invalid configuration argument，这意味着我的 GPUnumBlocks或blockSize参数不受支持。没有错误检查脚本只是默默地结束。正如 Robber Corvella 在评论中建议的那样，这里是正确的错误处理链接：

正确的 CUDA 错误检查

此外，跑步也有cuda-memcheck帮助

c++ - 使用 nvprof 进行分析时没有 GPU 活动

1 回答 1

Related

Reference