1

我想知道如何准确定位我的应用程序把时间主要花在了哪里。这是一个带有 CUDA 调用的 C++ 程序,因此我编写了包装函数,供 C++ 代码调用 CUDA 代码。在 C++ 一侧计时,这次调用的执行时间约为 5 秒;但是用 Nsight 分析时,内核本身只需要约 8 毫秒。这怎么可能?

C++ 代码:

// Host-side wall-clock timing around the whole wrapper call. The interval
// covers everything callDivideKernel does on the host: creating the CUDA
// events, launching the kernel, both synchronizations and the wrapper's own
// printf -- not only the kernel's execution time on the GPU.
// NOTE(review): the unit returned by get_host_current_time() is not visible
// here (presumably seconds) -- confirm before comparing this figure with the
// millisecond value printed inside the wrapper.
// NOTE(review): if GPU work queued earlier is still running when this code is
// reached, the wrapper's synchronization would presumably wait for it as
// well, inflating this host-side figure -- verify by synchronizing the device
// before taking start_divide.
double start_divide = get_host_current_time();
callDivideKernel( keep, d_a, d_A_N );
double end_divide = get_host_current_time();
// Prints the elapsed host time in whatever unit get_host_current_time() uses.
printf("divideKernel : %g\n", end_divide - start_divide);

.cu 文件:

// Forward declaration so the wrapper can launch the kernel even though the
// kernel's definition appears later in the file. Must match that definition.
__global__ void DivideKernel(int N, float* A, int* A_N);

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t err, const char* what){
  if(err == cudaSuccess){
    return true;
  }
  fprintf(stderr, "callDivideKernel: %s failed: %s\n",
          what, cudaGetErrorString(err));
  return false;
}

/*
 * Host wrapper: launches DivideKernel on the default stream, times it with
 * CUDA events, prints the elapsed GPU time in milliseconds and waits for the
 * device to finish before returning.
 *
 * Parameters:
 *   N    - matrix dimension forwarded to the kernel; the kernel processes
 *          N*(N+1)/2 elements, one per thread.
 *   A    - pointer forwarded to the kernel (dereferenced on the device).
 *   A_N  - pointer forwarded to the kernel (dereferenced on the device).
 *
 * Errors from the CUDA runtime are reported on stderr; the function still
 * releases its events and returns normally.
 */
void callDivideKernel(int N, float* A, int* A_N){

  const long long kThreadsPerBlock = 512;
  // 65535 is accepted as a grid x- and y-extent on every compute capability.
  const long long kMaxGridExtent = 65535;

  // One thread per element of the triangle; computed in 64 bits so the
  // product N*(N+1) cannot overflow here.
  const long long kmax = ((long long)N * ((long long)N + 1)) / 2;

  // Size the grid from N instead of a fixed 618x128 layout, which stops
  // covering every element once N*(N+1)/2 exceeds 618*128*512.
  long long blocks = (kmax + kThreadsPerBlock - 1) / kThreadsPerBlock;
  if(blocks < 1){
    blocks = 1;  // a zero-sized grid is an invalid launch configuration
  }
  const long long gridX = (blocks < kMaxGridExtent) ? blocks : kMaxGridExtent;
  const long long gridY = (blocks + gridX - 1) / gridX;
  if(gridY > kMaxGridExtent){
    fprintf(stderr, "callDivideKernel: N = %d needs more blocks than a "
                    "2D grid can hold\n", N);
    return;
  }

  dim3 dimGrid((unsigned int)gridX, (unsigned int)gridY);
  dim3 dimBlock((unsigned int)kThreadsPerBlock);

  cudaEvent_t start = 0;
  cudaEvent_t stop = 0;
  float elapsedMs = 0.0f;

  const bool haveStart = cudaCallOk(cudaEventCreate(&start), "cudaEventCreate(start)");
  const bool haveStop  = cudaCallOk(cudaEventCreate(&stop),  "cudaEventCreate(stop)");
  bool timed = haveStart && haveStop;

  if(timed){
    timed = cudaCallOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
  }

  DivideKernel<<< dimGrid,dimBlock >>>(N, A, A_N);
  // A kernel launch returns nothing; configuration errors only show up here.
  const bool launched = cudaCallOk(cudaGetLastError(), "DivideKernel launch");

  if(timed){
    timed = cudaCallOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
  }
  if(timed){
    // Block the host until the stop event (and thus the kernel) completed.
    timed = cudaCallOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
  }
  if(timed && launched){
    timed = cudaCallOk(cudaEventElapsedTime(&elapsedMs, start, stop),
                       "cudaEventElapsedTime");
  }
  if(timed && launched){
    printf("callDividekernel = %f ms\n",elapsedMs);
  }

  // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
  // also surfaces any fault raised while the kernel was executing.
  cudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // Release the events; the original leaked both on every call.
  if(haveStart){
    cudaCallOk(cudaEventDestroy(start), "cudaEventDestroy(start)");
  }
  if(haveStop){
    cudaCallOk(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
  }

}

/*
 * DivideKernel: for every (row, col) with col <= row, divides
 * A[row*N + col] in place by max(1, A_N[row*N + col]).
 *
 * Launch layout: one thread per element. Each thread derives a flat index k
 * from a 1D block and a 1D or 2D grid (blockIdx.y selects a further "slab"
 * of gridDim.x * blockDim.x threads). Threads with k >= N*(N+1)/2 do
 * nothing, so the grid may be larger than needed but must provide at least
 * N*(N+1)/2 threads to cover every element.
 *
 * Parameters:
 *   N    - row length used for indexing (index = row*N + col).
 *   A    - values, updated in place.
 *   A_N  - integer divisors; values below 1 are treated as 1, which also
 *          avoids a division by zero.
 *
 * All index arithmetic is 64-bit: N*(N+1) and row*N + col exceed the range
 * of a 32-bit int once N is larger than 46340.
 */
__global__ void DivideKernel(int N, float* A, int* A_N){

  // Flat thread index across the whole (possibly 2D) grid.
  const long long threadsPerGridRow = (long long)blockDim.x * gridDim.x;
  const long long k = (long long)blockIdx.x * blockDim.x + threadIdx.x +
    threadsPerGridRow * blockIdx.y;

  // Number of elements with col <= row.
  const long long kmax = ((long long)N * ((long long)N + 1)) / 2;

  if(k < kmax){
    // Invert k = row*(row+1)/2 + col. Double precision is required here:
    // a float cannot represent 2*k exactly once it exceeds 2^24, which would
    // yield a wrong row for large k.
    const long long row = (long long)(sqrt(0.25 + 2.0 * (double)k) - 0.5);
    const long long col = k - (row * (row + 1)) / 2;

    // Compute the element offset once and reuse it for both arrays.
    const size_t idx = (size_t)row * (size_t)N + (size_t)col;
    const int val = max(1, A_N[idx]);
    A[idx] /= (float)val;
  }
}

结果:

callDividekernel = 7.111040 ms
divideKernel : 5.66533
4

0 回答 0