我目前正在使用 CUDA 进行一个项目,其中每 1ms 用 200-10000 个新事件刷新管道。每次,我都想调用一个(/两个)内核来计算一小部分输出;然后将这些输出馈送到管道的下一个元素。
理论流程为:
- 在一个接收数据
std::vector
cudaMemcpy
到 GPU 的向量- 加工
- 生成小输出列表
cudaMemcpy
到输出std::vector
但是当我调用cudaDeviceSynchronize
一个没有处理的 1block/1thread 空内核时,它已经平均需要 0.7 到 1.4ms,这已经高于我的 1ms 时间范围。
我最终可以更改管道的时间范围,以便每 5 毫秒接收一次事件,但每次多 5 倍。但这并不理想。
最小化开销的最佳方法是cudaDeviceSynchronize
什么?流在这种情况下会有所帮助吗?或另一种有效运行管道的解决方案。
(Jetson TK1,计算能力 3.2)
这是应用程序的 nvprof 日志:
==8285== NVPROF is profiling process 8285, command: python player.py test.rec
==8285== Profiling application: python player.py test.rec
==8285== Profiling result:
Time(%) Time Calls Avg Min Max Name
94.92% 47.697ms 5005 9.5290us 1.7500us 13.083us reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
5.08% 2.5538ms 8 319.23us 99.750us 413.42us [CUDA memset]
==8285== API calls:
Time(%) Time Calls Avg Min Max Name
75.00% 5.03966s 5005 1.0069ms 25.083us 11.143ms cudaDeviceSynchronize
17.44% 1.17181s 5005 234.13us 83.750us 3.1391ms cudaLaunch
4.71% 316.62ms 9 35.180ms 23.083us 314.99ms cudaMalloc
2.30% 154.31ms 50050 3.0830us 1.0000us 2.6866ms cudaSetupArgument
0.52% 34.857ms 5005 6.9640us 2.5000us 464.67us cudaConfigureCall
0.02% 1.2048ms 8 150.60us 71.917us 183.33us cudaMemset
0.01% 643.25us 83 7.7490us 1.3330us 287.42us cuDeviceGetAttribute
0.00% 12.916us 2 6.4580us 2.0000us 10.916us cuDeviceGetCount
0.00% 5.3330us 1 5.3330us 5.3330us 5.3330us cuDeviceTotalMem
0.00% 4.0830us 1 4.0830us 4.0830us 4.0830us cuDeviceGetName
0.00% 3.4160us 2 1.7080us 1.5830us 1.8330us cuDeviceGet
程序的小重构(最后的 nvprof 日志) - 由于某种原因,平均值cudaDeviceSynchronize
低 4 倍,但对于空的 1 线程内核来说仍然非常高:
/* Compile with `nvcc test.cu -I.`
* with -I pointing to "helper_cuda.h" and "helper_string.h" from CUDA samples
**/
#include <iostream>
#include <cuda.h>
#include <helper_cuda.h>
#define MAX_INPUT_BUFFER_SIZE 131072
typedef struct {
unsigned short x;
unsigned short y;
short a;
long long b;
} Event;
long long *d_a_[2], *d_b_[2];
float *d_as_, *d_bs_;
bool *d_some_bool_[2];
Event *d_data_;
int width_ = 320;
int height_ = 240;
__global__ void reset_timesurface(long long ts,
long long *d_a_0, long long *d_a_1,
long long *d_b_0, long long *d_b_1,
float *d_as, float *d_bs,
bool *d_some_bool_0, bool *d_some_bool_1, Event *d_data) {
// nothing here
}
void reset_errors(long long ts) {
static const int n = 1024;
static const dim3 grid_size(width_ * height_ / n
+ (width_ * height_ % n != 0), 1, 1);
static const dim3 block_dim(n, 1, 1);
reset_timesurface<<<1, 1>>>(ts, d_a_[0], d_a_[1],
d_b_[0], d_b_[1],
d_as_, d_bs_,
d_some_bool_[0], d_some_bool_[1], d_data_);
cudaDeviceSynchronize();
// static long long *h_holder = (long long*)malloc(sizeof(long long) * 2000);
// cudaMemcpy(h_holder, d_a_[0], 0, cudaMemcpyDeviceToHost);
}
int main(void) {
checkCudaErrors(cudaMalloc(&(d_a_[0]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_a_[0], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_a_[1]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_a_[1], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_b_[0]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_b_[0], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_b_[1]), sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMemset(d_b_[1], 0, sizeof(long long)*width_*height_*2));
checkCudaErrors(cudaMalloc(&d_as_, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMemset(d_as_, 0, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMalloc(&d_bs_, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMemset(d_bs_, 0, sizeof(float)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_some_bool_[0]), sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMemset(d_some_bool_[0], 0, sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMalloc(&(d_some_bool_[1]), sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMemset(d_some_bool_[1], 0, sizeof(bool)*width_*height_*2));
checkCudaErrors(cudaMalloc(&d_data_, sizeof(Event)*MAX_INPUT_BUFFER_SIZE));
for (int i = 0; i < 5005; ++i)
reset_errors(16487L);
cudaFree(d_a_[0]);
cudaFree(d_a_[1]);
cudaFree(d_b_[0]);
cudaFree(d_b_[1]);
cudaFree(d_as_);
cudaFree(d_bs_);
cudaFree(d_some_bool_[0]);
cudaFree(d_some_bool_[1]);
cudaFree(d_data_);
cudaDeviceReset();
}
/* nvprof ./a.out
==9258== NVPROF is profiling process 9258, command: ./a.out
==9258== Profiling application: ./a.out
==9258== Profiling result:
Time(%) Time Calls Avg Min Max Name
92.64% 48.161ms 5005 9.6220us 6.4160us 13.250us reset_timesurface(__int64, __int64*, __int64*, __int64*, __int64*, float*, float*, bool*, bool*, Event*)
7.36% 3.8239ms 8 477.99us 148.92us 620.17us [CUDA memset]
==9258== API calls:
Time(%) Time Calls Avg Min Max Name
53.12% 1.22036s 5005 243.83us 9.6670us 8.5762ms cudaDeviceSynchronize
25.10% 576.78ms 5005 115.24us 44.250us 11.888ms cudaLaunch
9.13% 209.77ms 9 23.308ms 16.667us 208.54ms cudaMalloc
6.56% 150.65ms 1 150.65ms 150.65ms 150.65ms cudaDeviceReset
5.33% 122.39ms 50050 2.4450us 833ns 6.1167ms cudaSetupArgument
0.60% 13.808ms 5005 2.7580us 1.0830us 104.25us cudaConfigureCall
0.10% 2.3845ms 9 264.94us 22.333us 537.75us cudaFree
0.04% 938.75us 8 117.34us 58.917us 169.08us cudaMemset
0.02% 461.33us 83 5.5580us 1.4160us 197.58us cuDeviceGetAttribute
0.00% 15.500us 2 7.7500us 3.6670us 11.833us cuDeviceGetCount
0.00% 7.6670us 1 7.6670us 7.6670us 7.6670us cuDeviceTotalMem
0.00% 4.8340us 1 4.8340us 4.8340us 4.8340us cuDeviceGetName
0.00% 3.6670us 2 1.8330us 1.6670us 2.0000us cuDeviceGet
*/