cuda - 为什么 cuMemAddressReserve() 因 CUDA_INVALID_VALUE 而失败？

Question

考虑以下程序（用 C 语法编写）：

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    /* Initialize the driver API; no initialization flags are passed. */
    CUresult status = cuInit(0);
    if (status != CUDA_SUCCESS) {
        exit(EXIT_FAILURE);
    }

    /* Create a context on device 0 with default creation flags.
     * The newly created context also becomes the current context,
     * so every call below runs inside a context. */
    CUdevice device = 0;
    CUcontext context;
    status = cuCtxCreate(&context, 0, device);
    if (status != CUDA_SUCCESS) {
        exit(EXIT_FAILURE);
    }

    /* Reservation parameters: no preferred start address, default
     * alignment, no flags, and a range of 0x20000 bytes. */
    const CUdeviceptr preferred_address = 0;
    const size_t range_bytes = 0x20000;
    const size_t range_alignment = 0; /* default */
    const unsigned long long flags = 0;
    CUdeviceptr range_start;

    // -----------------------------------
    // ==>> FAILURE on next statement <<==
    // -----------------------------------

    status = cuMemAddressReserve(&range_start, range_bytes, range_alignment,
                                 preferred_address, flags);
    if (status == CUDA_SUCCESS) {
        return 0;
    }

    /* Report the driver's description of the error and bail out. */
    const char* description;
    cuGetErrorString(status, &description);
    fprintf(stderr, "cuMemAddressReserve() failed: %s\n", description);
    exit(EXIT_FAILURE);
}

程序在尝试保留地址范围时失败：

cuMemAddressReserve() failed: invalid argument

我传入的参数有什么问题？是大小吗？是对齐方式吗？还是因为请求的地址为 0？如果是后者——当我其实并不在乎具体地址时，我又怎么知道该请求哪个地址呢？

score 3 · Accepted Answer

如果我没记错的话，传给虚拟内存管理函数的大小必须是 CUDA 分配粒度的倍数。请参阅 cuMemGetAllocationGranularity 以及这篇博客文章：https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management/

以下适用于我的机器。

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main() {
    /* Reserves a virtual address range whose size has been rounded up to the
     * device's minimum allocation granularity, then releases it again.
     * Exits with EXIT_FAILURE if any driver API call fails. */
    CUresult result;
    unsigned int init_flags = 0;
    result = cuInit(init_flags);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUcontext ctx;
    unsigned int ctx_create_flags = 0;
    CUdevice device_id = 0;
    result = cuCtxCreate(&ctx, ctx_create_flags, device_id);
    // Note: The created context is also made the current context,
    // so we are _in_ a context from now on.
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    CUdeviceptr requested = 0;
    CUdeviceptr reserved;
    size_t size = 0x20000;
    size_t alignment = 0; // default
    unsigned long long reserve_flags = 0;

    size_t granularity;
    CUmemAllocationProp prop;
    // Zero the whole structure first: only some members are assigned below,
    // and the rest (handle types, allocation flags, Win32 metadata) must not
    // be left holding indeterminate stack contents when handed to the driver.
    memset(&prop, 0, sizeof(prop));
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    // Query the same device the context was created on.
    prop.location.id = (int)device_id;
    result = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    // %zu is the portable conversion for size_t (unsigned long is narrower
    // than size_t on some 64-bit platforms).
    printf("minimum granularity %zu\n", granularity);

    // Round the requested size up to the next multiple of the granularity.
    size_t padded_size = ((granularity + size - 1) / granularity) * granularity;
    result = cuMemAddressReserve(&reserved, padded_size, alignment, requested, reserve_flags);
    if (result != CUDA_SUCCESS) {
        const char* error_string;
        cuGetErrorString(result, &error_string);
        fprintf(stderr, "cuMemAddressReserve() failed: %s\n", error_string);
        exit(EXIT_FAILURE);
    }

    // Release what was acquired: the reserved address range, then the context.
    result = cuMemAddressFree(reserved, padded_size);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    result = cuCtxDestroy(ctx);
    if (result != CUDA_SUCCESS) { exit(EXIT_FAILURE); }
    return 0;
}

score -3 · Accepted Answer

tl; dr：您的保留区域大小不是（某些设备的）分配粒度的倍数。

正如 @AbatorAbetor 所建议的，cuMemAddressReserve() 隐式要求内存区域的大小是某个粒度值的倍数。尽管 0x20000 看起来已经是一个足够大的值（2^17 字节，即 128 KiB……而系统内存页面通常只有 4 KiB = 2^12 字节），NVIDIA GPU 在这方面的要求却相当苛刻。

例如，具有约 4GB 内存的 Pascal GTX 1050 Ti GPU 的粒度为 0x200000，即 2 MiB——是您尝试保留的大小的 16 倍。

现在，如果我们有两个具有不同粒度值的设备会发生什么？我们需要使用最小公倍数吗？谁知道。

无论如何，底线：始终在分配之前和保留之前检查粒度。

我已将此作为文档错误提交给 NVIDIA，错误 3486420（但您可能无法访问该链接，因为 NVIDIA 对其用户隐藏了他们的错误）。

cuda - 为什么 cuMemAddressReserve() 因 CUDA_INVALID_VALUE 而失败？

2 回答 2

Related

Reference