0

我正在尝试在下面列出的程序中使用 CUDA 驱动程序 API 将数据复制到 2D 数组中,但是在传递复制参数时出现了“无效值”错误。其中哪个值是错误的?

#include <cuda.h>

#include <iostream>
#include <iomanip>
#include <numeric>
#include <limits>
#include <cstring>

/// Reports a fatal error and terminates the program.
///
/// Writes `message` followed by a newline to standard error, then exits the
/// process with the EXIT_FAILURE status. Never returns.
[[noreturn]] void die_(const std::string& message)
{
    std::cerr << message;
    std::cerr << "\n";
    exit(EXIT_FAILURE);
}

/// Terminates the program if a CUDA driver API call did not succeed.
///
/// @param status         Result code returned by a CUDA driver API call.
/// @param extra_message  Context describing the failed operation; it is
///                       printed before the driver's own error description.
///
/// Does nothing when `status` is CUDA_SUCCESS. Otherwise reports
/// "<extra_message>: <description>" through die_(), which does not return.
void die_if_error(CUresult status, const std::string& extra_message) {
    if (status == CUDA_SUCCESS) {
        return;
    }
    // cuGetErrorString can itself fail for an unrecognized status code, in
    // which case it leaves the string pointer null. Appending a null
    // `const char*` to a std::string is undefined behavior, so substitute a
    // fallback description instead.
    const char* error_string = nullptr;
    const CUresult lookup_status = cuGetErrorString(status, &error_string);
    if (lookup_status != CUDA_SUCCESS || error_string == nullptr) {
        error_string = "unrecognized CUDA error code";
    }
    die_(extra_message + ": " + error_string);
}

/// Reinterprets a CUDA driver address value as a typed pointer.
///
/// @tparam T       Pointee type of the resulting pointer (defaults to void).
/// @param address  Address value as used by the driver API.
/// @return         The same address, cast to `T*` with reinterpret_cast.
template <typename T = void>
T* as_pointer(CUdeviceptr address) noexcept
{
    auto* const typed_ptr = reinterpret_cast<T*>(address);
    return typed_ptr;
}

/// Reinterprets a raw pointer as a CUDA driver address value.
///
/// @param ptr  Pointer to convert.
/// @return     The pointer's value, cast to CUdeviceptr with reinterpret_cast.
CUdeviceptr as_address(void* ptr) noexcept
{
    const auto address = reinterpret_cast<CUdeviceptr>(ptr);
    return address;
}

int main() {
    CUresult status;
    int device_id = 0;
    status = cuInit(0);
    die_if_error(status, "Initializing the CUDA driver");
    CUcontext pctx;
    status = cuDevicePrimaryCtxRetain(&pctx, device_id);
    die_if_error(status, "Obtaining the primary device context");
    cuCtxSetCurrent(pctx);
    struct { unsigned width, height; } dims = { 3, 3 };
    std::cout << "Creating a " << dims.width << " x " << dims.height << " CUDA array" << std::endl;
    CUarray arr_handle;
    {
        CUDA_ARRAY_DESCRIPTOR array_descriptor;
        array_descriptor.Width = dims.width;
        array_descriptor.Height = dims.height;
        array_descriptor.Format = CU_AD_FORMAT_FLOAT;
        array_descriptor.NumChannels = 1;
        status = cuArrayCreate(&arr_handle, &array_descriptor);
        die_if_error(status, "Failed creating a 2D CUDA array");
    }
    auto arr_size = dims.width * dims.height;
    CUdeviceptr dptr;
    status = cuMemAllocManaged(&dptr, arr_size, CU_MEM_ATTACH_GLOBAL);
    die_if_error(status, "Failed allocating managed memory");
    float* ptr_in = as_pointer<float>(dptr);
    std::iota(ptr_in, ptr_in + arr_size, 0);
    CUmemorytype ptr_in_memory_type;
    status = cuPointerGetAttribute(&ptr_in_memory_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, as_address(ptr_in));
    if (not (ptr_in_memory_type == CU_MEMORYTYPE_UNIFIED or ptr_in_memory_type == CU_MEMORYTYPE_DEVICE)) {
        die_("Unexpected memory type for ptr_in");
    }
    std::cout << "The memory type of ptr_in is " << (ptr_in_memory_type == CU_MEMORYTYPE_DEVICE ? "DEVICE" : "UNIFIED") << std::endl;
    std::cout << "Will copy from ptr_in into a 2D CUDA array" << std::endl;

    CUDA_MEMCPY2D cp;
    {
        // Source

        cp.srcXInBytes = 0; cp.srcY = 0; // No offset
        cp.srcMemoryType = ptr_in_memory_type;
        cp.srcDevice = as_address(ptr_in);
        // no extra source pitch
        cp.srcPitch = dims.width * sizeof(float);

        // Destination

        cp.dstXInBytes = 0; cp.dstY = 0; // No destination offset
        cp.dstMemoryType = CU_MEMORYTYPE_ARRAY;
        cp.dstArray = arr_handle;

        cp.WidthInBytes = dims.width * sizeof(float);
        cp.Height = dims.height;
    }
    status = cuMemcpy2D(&cp);
    die_if_error(status, "cuMemcpy2D failed");
    cuMemFree(as_address(ptr_in));
}

该程序的完整输出:

Creating a 3 x 3 CUDA array
The memory type of ptr_in is DEVICE
Will copy from ptr_in into a 2D CUDA array
cuMemcpy2D failed: invalid argument

附加信息:

  • CUDA 工具包版本:11.4
  • NVIDIA驱动版本:470.57.02
  • 操作系统发行版:Devuan Chimaera GNU/Linux
  • GPU:GeForce 1050 TI Boost(计算能力 6.1)
  • 主机架构:amd64
4

2 回答 2

2

错误在这里:

auto arr_size = dims.width * dims.height;
CUdeviceptr dptr;
status = cuMemAllocManaged(&dptr, arr_size, CU_MEM_ATTACH_GLOBAL);
                                  ^^^^^^^^

那应该是arr_size*sizeof(float)

与 malloc() 一样,cuMemAllocManaged() 需要一个以字节为单位的大小参数。此大小需要与您在 cuMemcpy2D 调用中隐含的传输大小一致(大于或等于该大小)。

于 2021-11-29T22:42:10.233 回答
-1

tl;dr:“无效值”错误可能意味着指针所指向的已分配内存不足

(@RobertCrovella 注意到了这个错误,但我想强调一点:)

我们已经习惯了 API 不能过多地检查指针,只凭信心接受它们,然后可能会因无效访问错误而失败(主机端的分段错误、设备端的无效内存访问等)

但是,CUDA(特别是 CUDA 驱动程序)更仔细地检查指针。您已经知道是这种情况,看看它如何告诉您指针指向的内存类型。

好吧,似乎 cuMemcpy2D() 还检查了为 ptr_in 分配的内存量——并发现它不足以覆盖要复制的区域,也就是说,它将会从未分配的内存中复制。这就是它返回“无效值”错误的原因。所以这个错误代码是合理的,尽管相当含糊。

具体来说,正如 @RobertCrovella 指出的那样,您没有为 3x3 个浮点数分配足够的内存——您的 arr_size 是以元素为单位的,即 9,而您需要为 9 个浮点数分配空间,即 36 个字节。您向其中写入数据时没有出错只是运气好,可能是因为 CUDA 的内存分配粒度或内存页面粒度等原因。

于 2021-11-30T08:34:50.870 回答