我想测量一个需要运行多次的内核的执行时间,每次执行时内核要处理的数据都不同。我的代码如下,cudaMemcpy 的耗时不应计入其中。
// Times nIter launches of matrixMulCUDA1 with a start/stop CUDA event pair and
// accumulates the per-launch elapsed time into msecTotal.
// NOTE(review): every call's status is stored in `error`, but nothing in this
// snippet inspects it — confirm it is checked elsewhere.
1 cudaEvent_t start;
2 error = cudaEventCreate(&start);
3 cudaEvent_t stop;
4 error = cudaEventCreate(&stop);
// Running sum of the per-iteration values returned by cudaEventElapsedTime
// (milliseconds, per the CUDA runtime API).
6 float msecTotal = 0.0f;
7 int nIter = 300;
8 for (int j = 0; j < nIter; j++)
9 {
// Issued before the start event is recorded, i.e. outside the start/stop pair.
10 cudaMemcpy(...);
// Record the start event
11 error = cudaEventRecord(start, NULL);
12 matrixMulCUDA1<<< grid, threads >>>(...);
// Record the stop event
13 error = cudaEventRecord(stop, NULL);
// Wait for the stop event to complete before querying the elapsed time.
14 error = cudaEventSynchronize(stop);
15 float msec = 0.0f;
// Elapsed time between the two events for this iteration only.
16 error = cudaEventElapsedTime(&msec, start, stop);
17 msecTotal+=msec;
18 }
// Prints the sum over all nIter iterations, not a per-launch average.
19 cout<<"Total time = "<<msecTotal<<endl;
为了公平起见,用于对比的算法应该如下:
// Times nIter launches of matrixMulCUDA2 with a start/stop CUDA event pair and
// accumulates the per-launch elapsed time into msecTotal. No cudaMemcpy is
// issued inside this loop.
// NOTE(review): every call's status is stored in `error`, but nothing in this
// snippet inspects it — confirm it is checked elsewhere.
1 cudaEvent_t start;
2 error = cudaEventCreate(&start);
3 cudaEvent_t stop;
4 error = cudaEventCreate(&stop);
// Running sum of the per-iteration values returned by cudaEventElapsedTime
// (milliseconds, per the CUDA runtime API).
6 float msecTotal = 0.0f;
7 int nIter = 300;
8 for (int j = 0; j < nIter; j++)
9 {
// Record the start event
11 error = cudaEventRecord(start, NULL);
12 matrixMulCUDA2<<< grid, threads >>>(...);
// Record the stop event
13 error = cudaEventRecord(stop, NULL);
// Wait for the stop event to complete before querying the elapsed time.
14 error = cudaEventSynchronize(stop);
15 float msec = 0.0f;
// Elapsed time between the two events for this iteration only.
16 error = cudaEventElapsedTime(&msec, start, stop);
17 msecTotal+=msec;
18 }
// Prints the sum over all nIter iterations, not a per-launch average.
19 cout<<"Total time = "<<msecTotal<<endl;
我的问题是:这些方法正确吗?我不太确定。显然,这样测得的时间应该会比正常情况下更长。