我想知道如何准确找出我的应用程序中最耗时的位置。这是一段调用了 CUDA 的 C++ 代码,因此我编写了包装函数,让 C++ 代码通过它来调用 CUDA 代码。在 C++ 端对这次调用计时,执行时间约为 5 秒;但如果我用 Nsight 分析代码,内核只需要约 8 毫秒。这怎么可能?
C++ 端的代码:
// Host-side timing of the whole wrapper call. The interval measured here
// spans everything callDivideKernel does (event creation, the kernel launch,
// synchronization and its own printf), not only the kernel execution.
// NOTE(review): get_host_current_time() is not shown in this file; the unit
// of its return value is presumably seconds — confirm.
double start_divide = get_host_current_time();
// NOTE(review): `keep`, `d_a` and `d_A_N` are defined outside this snippet;
// presumably d_a / d_A_N are device pointers — verify against the caller.
callDivideKernel( keep, d_a, d_A_N );
double end_divide = get_host_current_time();
// Print the elapsed host time as the difference of the two samples.
printf("divideKernel : %g\n", end_divide - start_divide);
.cu 文件:
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool divideCudaOk(cudaError_t err, const char* what){
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "callDivideKernel: %s failed: %s\n",
            what, cudaGetErrorString(err));
    return false;
}

/*
 * Host wrapper: launches DivideKernel on the default stream, prints the GPU
 * time of the launch measured with CUDA events, and waits for the device to
 * finish before returning.
 *
 * Parameters are forwarded unchanged to DivideKernel:
 *   N   - size argument used by the kernel to bound and index its work
 *   A   - pointer passed to the kernel as the array to be divided in place
 *   A_N - pointer passed to the kernel as the array of integer divisors
 *
 * The launch configuration is fixed at a 618 x 128 grid of 512-thread blocks
 * (40,501,248 threads). The kernel only processes indices below N*(N+1)/2,
 * so with this configuration all elements are covered for N up to 8999.
 *
 * Timing is best-effort: if any event call fails, the error is reported on
 * stderr and the timing line is skipped, but the kernel is still launched.
 * CUDA errors are reported on stderr; the function itself returns nothing.
 */
void callDivideKernel(int N, float* A, int* A_N){
    cudaEvent_t start, stop;
    float time = 0.0f;

    // Track each event separately so that only successfully created events
    // are destroyed at the end.
    const bool haveStart = divideCudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    const bool haveStop  = divideCudaOk(cudaEventCreate(&stop),  "cudaEventCreate(stop)");
    bool timed = haveStart && haveStop;

    dim3 dimGrid(618,128);
    dim3 dimBlock(512);

    if (timed) {
        timed = divideCudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    }

    DivideKernel<<< dimGrid,dimBlock >>>(N, A, A_N);
    // A kernel launch returns no status itself; configuration errors are
    // only visible through cudaGetLastError().
    divideCudaOk(cudaGetLastError(), "DivideKernel launch");

    if (timed) {
        timed = divideCudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)")
             && divideCudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")
             && divideCudaOk(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
    }
    if (timed) {
        printf("callDividekernel = %f ms\n",time);
    }

    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
    // replacement and also surfaces errors raised while the kernel executed.
    divideCudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Release the events; the original code leaked two events per call.
    if (haveStart) {
        cudaEventDestroy(start);
    }
    if (haveStop) {
        cudaEventDestroy(stop);
    }
}
/*
 * DivideKernel: each thread handles one linear index k in [0, N*(N+1)/2).
 *
 * Launch layout: 1D thread blocks on a 2D grid (only threadIdx.x, blockIdx.x
 * and blockIdx.y are read). Threads whose k falls outside the range do
 * nothing, so the grid may be larger than the amount of work.
 *
 * k is decoded into a (row, col) pair with col = k - row*(row+1)/2, and the
 * element at offset row*N + col of A is divided in place by
 * max(1, A_N[row*N + col]), so a divisor below 1 is treated as 1.
 * NOTE(review): presumably A and A_N are row-major N x N device arrays and
 * only the lower triangle (col <= row) is touched — confirm with the caller.
 */
__global__ void DivideKernel(int N, float* A, int* A_N){
    // Flatten the (blockIdx.y, blockIdx.x, threadIdx.x) position into one index.
    const unsigned int threadsPerGridRow = blockDim.x * gridDim.x;
    const unsigned int withinGridRow = blockIdx.x * blockDim.x + threadIdx.x;
    const int k = withinGridRow + threadsPerGridRow * blockIdx.y;

    const int pairCount = (N*(N+1))/2;
    if (k >= pairCount) {
        return;  // surplus thread: nothing to process
    }

    // Invert k = row*(row+1)/2 + col. The literals are double, so the square
    // root is evaluated in double precision.
    const int row = (int)(sqrt(2.0*k + 0.25) - 0.5);
    const int col = k - (row*(row+1))/2;

    const int offset = row*N + col;
    const int divisor = max(1, A_N[offset]);
    A[offset] = A[offset] / (float)divisor;
}
结果:
callDividekernel = 7.111040 ms
divideKernel : 5.66533