asynchronous - 为什么 cudaMemcpyAsync 和内核启动即使使用异步流也会阻塞？

Question

考虑以下程序，用于将非阻塞 GPU 流上的一些工作排入队列：

#include <iostream>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main() {
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 1e7;
    bool callback_executed = false;
    cudaStream_t stream;
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

缓冲区的大小和内核睡眠周期的长度足够高，当它们与 CPU 线程并行执行时，它应该在它们结束之前完成排队（复制 8 毫秒 + 8 毫秒，复制 20 毫秒）内核）。

然而，看看下面的跟踪，似乎两者cudaMemcpyAsync()实际上是同步的，即它们阻塞直到（非阻塞）流实际上结束了复制。这是预期的行为吗？它似乎收缩了CUDA Runtime API 文档的相关部分。这有什么意义？

跟踪：（编号的行，以微秒为单位的时间）：

      1 "Start"        "Duration"    "Grid X"                             "Grid Y"  "Grid Z"    "Block X"   "Block Y"                       "Block Z"  
    104 14102.830000   59264.347000  "cudaMalloc"
    105 73368.351000   19.886000     "cudaStreamCreateWithFlags"
    106 73388.and 20 ms for the kernel).

然而，看看下面的跟踪，似乎两者cudaMemcpyAsync()实际上是同步的，即它们阻塞直到（非阻塞）流实际上结束了复制。这是预期的行为吗？这似乎与 CUDA Runtime API 文档的相关部分相矛盾。这有什么意义？

850000   8330.257000   "cudaMemcpyAsync"
        107 73565.702000   8334.265000   47.683716                            5.587311  "Pageable"  "Device"    "GeForce GTX 650 Ti BOOST (0)"  "1"        
        108 81721.124000   2.394000      "cudaConfigureCall"
        109 81723.865000   3.585000      "cudaSetupArgument"
        110 81729.332000   30.742000     "cudaLaunch (dummy(__int64) [107])"
        111 81760.604000   39589.422000  "cudaMemcpyAsync"
        112 81906.303000   20157.648000  128                                  1         1           128         1                               1          
        113 102073.103000  18736.208000  47.683716                            2.485355  "Device"    "Pageable"  "GeForce GTX 650 Ti BOOST (0)"  "1"        
        114 121351.936000  5.560000      "cudaStreamSynchronize"

score 2 · Accepted Answer

这看起来很奇怪，所以我联系了 CUDA 驱动团队的某个人，他确认文档是正确的。我也能够确认：

#include <iostream>
#include <memory>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main(int argc, char* argv[]) {
  cudaFree(0);
  struct timespec start, stop;
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;
    bool callback_executed = false;
    cudaStream_t stream;
    void* host_ptr;
    if (argc == 1){
      host_ptr = malloc(buffer_size);
    }
    else {
      cudaMallocHost(&host_ptr, buffer_size, 0);
    }
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000<< std::endl;

    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

这基本上是你的代码，有一些修改：

时间测量
从可分页或固定内存分配的开关
2 GiB 的缓冲区大小以确保可测量的复制时间
cudaFree(0)强制 CUDA 延迟初始化。

结果如下：

$ nvcc -std=c++11 main.cu -lrt

$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpyDtoH pageable -> device, fully async)
Elapsed: 5.20288 # (memcpyHtoD device -> pageable, sync)

$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpyDtoH pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpyDtoH device -> pinned, fully async)

从可分页复制到设备时速度较慢，但它确实是异步的。

我很抱歉我的错误。我删除了我以前的评论以避免混淆人们。

score 1 · Accepted Answer

正如@RobinThoni 所指出的那样，CUDA 内存副本仅在严格的条件下是异步的。对于有问题的代码，问题主要是使用未固定（即分页）的主机内存。

引用运行时 API 文档的单独部分（强调我的）：

2.API同步行为

API 以同步和异步形式提供 memcpy/memset 函数，后者具有“Async”后缀。这是用词不当，因为每个函数都可能表现出同步或异步行为，具体取决于传递给函数的参数。

...

异步

对于从设备内存到可分页主机内存的传输，该函数仅在复制完成后返回。

而这只是其中的一半！这是真的

对于从可分页主机内存到设备内存的传输，数据将首先暂存于固定主机内存，然后复制到设备；并且该函数仅在暂存发生后才返回。

asynchronous - 为什么 cudaMemcpyAsync 和内核启动即使使用异步流也会阻塞？

2 回答 2

2.API同步行为

Related

Reference