考虑以下程序,用于将非阻塞 GPU 流上的一些工作排入队列:
#include <iostream>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main() {
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 1e7;
bool callback_executed = false;
cudaStream_t stream;
auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
缓冲区的大小和内核睡眠周期的长度足够高,当它们与 CPU 线程并行执行时,它应该在它们结束之前完成排队(复制 8 毫秒 + 8 毫秒,复制 20 毫秒)内核)。
然而,看看下面的跟踪,似乎两者cudaMemcpyAsync()
实际上是同步的,即它们阻塞直到(非阻塞)流实际上结束了复制。这是预期的行为吗?它似乎收缩了CUDA Runtime API 文档的相关部分。这有什么意义?
跟踪:(编号的行,以微秒为单位的时间):
1 "Start" "Duration" "Grid X" "Grid Y" "Grid Z" "Block X" "Block Y" "Block Z"
104 14102.830000 59264.347000 "cudaMalloc"
105 73368.351000 19.886000 "cudaStreamCreateWithFlags"
106 73388.and 20 ms for the kernel).
然而,看看下面的跟踪,似乎两者cudaMemcpyAsync()
实际上是同步的,即它们阻塞直到(非阻塞)流实际上结束了复制。这是预期的行为吗?这似乎与 CUDA Runtime API 文档的相关部分相矛盾。这有什么意义?
850000 8330.257000 "cudaMemcpyAsync"
107 73565.702000 8334.265000 47.683716 5.587311 "Pageable" "Device" "GeForce GTX 650 Ti BOOST (0)" "1"
108 81721.124000 2.394000 "cudaConfigureCall"
109 81723.865000 3.585000 "cudaSetupArgument"
110 81729.332000 30.742000 "cudaLaunch (dummy(__int64) [107])"
111 81760.604000 39589.422000 "cudaMemcpyAsync"
112 81906.303000 20157.648000 128 1 1 128 1 1
113 102073.103000 18736.208000 47.683716 2.485355 "Device" "Pageable" "GeForce GTX 650 Ti BOOST (0)" "1"
114 121351.936000 5.560000 "cudaStreamSynchronize"