cuda - 在 CUDA 内核中是否有与 memcpy() 等效的方法？

Question

我正在尝试使用 CUDA 内核异步分解和重塑数组的结构。memcpy()在内核中不起作用，cudaMemcpy()*;也不起作用我不知所措。

谁能告诉我从 CUDA 内核中复制内存的首选方法？

值得注意的是，cudaMemcpy(void *to, void *from, size, cudaMemcpyDeviceToDevice)它不适用于我正在尝试做的事情，因为它只能从内核外部调用并且不会异步执行。

score 36 · Accepted Answer

是的，memcpy在 cuda 内核中有一个等价物。它被称为 memcpy。举个例子：

__global__ void kernel(int **in, int **out, int len, int N)
{
    int idx = threadIdx.x + blockIdx.x*blockDim.x;

    for(; idx<N; idx+=gridDim.x*blockDim.x)
        memcpy(out[idx], in[idx], sizeof(int)*len);

}

像这样编译没有错误：

$ nvcc -Xptxas="-v" -arch=sm_20 -c memcpy.cu 
ptxas info    : Compiling entry function '_Z6kernelPPiS0_ii' for 'sm_20'
ptxas info    : Function properties for _Z6kernelPPiS0_ii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 11 registers, 48 bytes cmem[0]

并发出 PTX：

.version 3.0
.target sm_20
.address_size 32

    .file   1 "/tmp/tmpxft_00000407_00000000-9_memcpy.cpp3.i"
    .file   2 "memcpy.cu"
    .file   3 "/usr/local/cuda/nvvm/ci_include.h"

.entry _Z6kernelPPiS0_ii(
    .param .u32 _Z6kernelPPiS0_ii_param_0,
    .param .u32 _Z6kernelPPiS0_ii_param_1,
    .param .u32 _Z6kernelPPiS0_ii_param_2,
    .param .u32 _Z6kernelPPiS0_ii_param_3
)
{
    .reg .pred  %p<4>;
    .reg .s32   %r<32>;
    .reg .s16   %rc<2>;


    ld.param.u32    %r15, [_Z6kernelPPiS0_ii_param_0];
    ld.param.u32    %r16, [_Z6kernelPPiS0_ii_param_1];
    ld.param.u32    %r2, [_Z6kernelPPiS0_ii_param_3];
    cvta.to.global.u32  %r3, %r15;
    cvta.to.global.u32  %r4, %r16;
    .loc 2 4 1
    mov.u32     %r5, %ntid.x;
    mov.u32     %r17, %ctaid.x;
    mov.u32     %r18, %tid.x;
    mad.lo.s32  %r30, %r5, %r17, %r18;
    .loc 2 6 1
    setp.ge.s32     %p1, %r30, %r2;
    @%p1 bra    BB0_5;

    ld.param.u32    %r26, [_Z6kernelPPiS0_ii_param_2];
    shl.b32     %r7, %r26, 2;
    .loc 2 6 54
    mov.u32     %r19, %nctaid.x;
    .loc 2 4 1
    mov.u32     %r29, %ntid.x;
    .loc 2 6 54
    mul.lo.s32  %r8, %r29, %r19;

BB0_2:
    .loc 2 7 1
    shl.b32     %r21, %r30, 2;
    add.s32     %r22, %r4, %r21;
    ld.global.u32   %r11, [%r22];
    add.s32     %r23, %r3, %r21;
    ld.global.u32   %r10, [%r23];
    mov.u32     %r31, 0;

BB0_3:
    add.s32     %r24, %r10, %r31;
    ld.u8   %rc1, [%r24];
    add.s32     %r25, %r11, %r31;
    st.u8   [%r25], %rc1;
    add.s32     %r31, %r31, 1;
    setp.lt.u32     %p2, %r31, %r7;
    @%p2 bra    BB0_3;

    .loc 2 6 54
    add.s32     %r30, %r8, %r30;
    ld.param.u32    %r27, [_Z6kernelPPiS0_ii_param_3];
    .loc 2 6 1
    setp.lt.s32     %p3, %r30, %r27;
    @%p3 bra    BB0_2;

BB0_5:
    .loc 2 9 2
    ret;
}

at 的代码块是编译器自动发出BB0_3的字节大小的循环。memcpy从性能的角度来看，使用它可能不是一个好主意，但它得到了完全支持（并且在所有架构上已经存在很长时间了）。

四年后编辑添加，由于设备端运行时 API 是作为 CUDA 6 发布周期的一部分发布的，因此也可以直接调用类似

cudaMemcpyAsync(void *to, void *from, size, cudaMemcpyDeviceToDevice)

在所有支持它的架构的设备代码中（Compute Capability 3.5 和使用单独编译和设备链接的更新硬件）。

score 9 · Accepted Answer

在我的测试中，最好的答案是编写自己的循环复制例程。就我而言：

__device__
void devCpyCplx(const thrust::complex<float> *in, thrust::complex<float> *out, int len){
  // Casting for improved loads and stores
  for (int i=0; i<len/2; ++i) {
    ((float4*) out)[i] = ((float4*) out)[i];
  }
  if (len%2) {
    ((float2*) out)[len-1] = ((float2*) in)[len-1];
  } 
}

memcpy在内核中工作，但它可能会慢得多。cudaMemcpyAsync从主机是一个有效的选项。

我需要通过 1,600 次复制调用在不同的缓冲区中将 800 个长度为 33,000 的连续向量分区为 16,500 长度。使用 nvvp 计时：

内核中的 memcpy：140 毫秒
主机上的 cudaMemcpy DtoD：34 毫秒
内核中的循环复制：8.6 ms

@talonmies 报告说memcpy逐字节复制，这在加载和存储方面效率低下。我的目标仍然是计算 3.0，所以我无法在设备上测试 cudaMemcpy。

编辑：在较新的设备上测试。设备运行时间cudaMemcpyAsync(out, in, bytes, cudaMemcpyDeviceToDevice, 0)相当于一个好的复制循环，比一个坏的复制循环更好。注意使用设备运行时 api 可能需要编译更改（sm>=3.5，单独编译）。请参阅编程指南和nvcc文档进行编译。

设备memcpy坏。楼主cudaMemcpyAsync好。设备cudaMemcpyAsync好。

score 1 · Accepted Answer

cudaMemcpy() does indeed run asynchronously but you're right, it can't be executed from within a kernel.

Is the new shape of the array determined based on some calculation? Then, you would typically run the same number of threads as there are entries in your array. Each thread would run a calculation to determine the source and destination of a single entry in the array and then copy it there with a single assignment. (dst[i] = src[j]). If the new shape of the array is not based on calculations, it might be more efficient to run a series of cudaMemcpy() with cudaMemCpyDeviceToDevice from the host.

cuda - 在 CUDA 内核中是否有与 memcpy() 等效的方法？

3 回答 3

Related

Reference