cuda - 常量内存中的 thrust::device_vector

Question

我有一个需要在设备上多次引用的浮点数组，所以我认为存放它的最佳位置是 __constant__ 内存（参考此引用）。该数组（或向量）只需在运行时初始化时写入一次，但会被多个不同的函数读取数百万次，因此每次函数调用都把它重新复制给内核似乎是个坏主意。

// Number of elements in dev_x; also used as the length of the vectors below.
const int n = 32;
// Array declared with the __constant__ qualifier; main fills it from the host
// with cudaMemcpyToSymbol.
__constant__ float dev_x[n]; //the array in question

// Unary functor that clamps its argument from below: it returns the larger of
// the input value and the stored bound C.
struct struct_max : public thrust::unary_function<float,float> {
    float C; // lower bound applied to every input

    struct_max(float lower_bound)
        : C(lower_bound)
    {}

    // Usable from both host and device code.
    __host__ __device__
    float operator()(const float& x) const
    {
        return fmax(x, C);
    }
};
// Forward declaration; foo is defined after main.
void foo(const thrust::host_vector<float> &, const float &);

// Entry point: creates a host vector of n floats (population elided), copies
// it into the __constant__ array dev_x, then calls foo with a threshold of 0.
int main()
{
    thrust::host_vector<float> x(n);
    //magic happens populate x

    // One-time upload of the host data into the constant-memory symbol.
    const size_t num_bytes = n * sizeof(float);
    cudaMemcpyToSymbol(dev_x, x.data(), num_bytes);

    foo(x, 0.0);
    return 0;
}

// Applies struct_max(x0) element-wise in two ways and copies each result back
// to the host:
//   1. through a device_vector copy of input_host_x (reported to work), and
//   2. through a thrust::device_ptr wrapped around the __constant__ array
//      dev_x (reported to crash at runtime).
//
// input_host_x: host data used by the first method.
// x0:           threshold handed to struct_max.
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
    thrust::device_vector<float> dev_sol(n);
    thrust::host_vector<float> host_sol(n);

    //this method works fine, but the memory transfer is unacceptable
    thrust::device_vector<float> input_dev_vec(n);
    input_dev_vec = input_host_x; //I want to avoid this
    thrust::transform(input_dev_vec.begin(),input_dev_vec.end(),dev_sol.begin(),struct_max(x0));
    host_sol = dev_sol; //this memory transfer for debugging

    //this method compiles fine, but crashes at runtime
    // NOTE(review): dev_x is a __constant__ array, which cannot be treated as
    // a regular device pointer, so wrapping it with device_pointer_cast is
    // the likely cause of the failure.
    thrust::device_ptr<float> dev_ptr = thrust::device_pointer_cast(dev_x);
    thrust::transform(dev_ptr,dev_ptr+n,dev_sol.begin(),struct_max(x0));
    // NOTE(review): presumably the error originates in the transform above and
    // only surfaces at this copy -- confirm.
    host_sol = dev_sol; //this line crashes
}

我尝试添加一个全局的 thrust::device_vector dev_x(n)，但这在运行时也崩溃了，而且它会位于 __global__ 内存中，而不是 __constant__ 内存中。

如果我干脆不用 thrust 库，这一切都可以正常工作，但是有没有办法将 thrust 库与全局变量和设备常量内存一起使用？

score 8 · Accepted Answer

好问题！您不能将__constant__数组视为常规设备指针。

我会回答你的问题（在下面的分隔线之后），但首先：这并不是 __constant__ 的合适用法，也不是你真正想要的。CUDA 中的常量缓存针对 warp 内各线程的统一访问做了优化，也就是 warp 中的所有线程同时访问同一个位置。如果 warp 中的每个线程访问不同的常量内存位置，这些访问将被串行化。因此，连续线程访问连续内存位置的访问模式将比统一访问慢 32 倍。你真的应该只使用普通的设备内存。如果数据只需写入一次、但要读取多次，那么只需使用一个 device_vector：初始化一次，然后多次读取。

要按照您的要求进行操作，您可以使用 thrust::counting_iterator 作为 thrust::transform 的输入，为 __constant__ 数组生成一系列索引。然后让仿函数的 operator() 接受 int 索引操作数而不是 float 值操作数，并在常量内存中查找对应的元素。

（请注意，这意味着您的仿函数现在只能作为 __device__ 代码使用。如果您需要可移植性，可以轻松地重载该运算符，使其接受 float 参数，并以不同的方式在主机数据上调用它。）

我修改了您的示例以初始化数据并打印结果以验证它是否正确。

#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>

// Number of elements in dev_x; also the length of the index range and of the
// result vectors used below.
const int n = 32;
// Array declared with the __constant__ qualifier; main fills it from the host
// with cudaMemcpyToSymbol and struct_max reads it by index.
__constant__ float dev_x[n]; //the array in question

// Unary functor meant to be applied to a range of indices: for index i it
// returns the larger of dev_x[i] and the threshold C.
//
// The argument type advertised through thrust::unary_function is int, matching
// the index actually taken by operator(). The constructor carries no execution
// space qualifier, so objects are created in host code, while operator() reads
// the __constant__ array dev_x and is therefore device-only.
struct struct_max : public thrust::unary_function<int,float> {
    float C; // lower bound applied to every element

    // threshold: value stored in C.
    struct_max(float threshold) : C(threshold) {}

    // only works as a device function
    // i must be a valid index into dev_x (0 <= i < n); no bounds check is done.
    __device__ float operator()(const int& i) const {
        // use index into constant array
        return fmaxf(dev_x[i], C);
    }
};

// Computes struct_max(x0) for every index in [0, n) on the device, copies the
// n results to the host and prints them, one per line.
//
// input_host_x: unused; the input is read from the __constant__ array dev_x,
//               which must already have been filled (main does this with
//               cudaMemcpyToSymbol before calling foo).
// x0:           threshold forwarded to struct_max.
void foo(const thrust::host_vector<float> &input_host_x, const float &x0) {
    (void)input_host_x; // kept only so existing callers keep compiling

    thrust::device_vector<float> dev_sol(n);
    thrust::host_vector<float> host_sol(n);

    // Transform over the indices 0..n-1 rather than over the data itself: the
    // functor looks each element up in dev_x, so no device pointer to the
    // __constant__ array is ever formed.
    thrust::transform(thrust::make_counting_iterator(0),
                      thrust::make_counting_iterator(n),
                      dev_sol.begin(),
                      struct_max(x0));

    // Copy the results back to the host for printing.
    host_sol = dev_sol;

    for (int i = 0; i < n; i++)
        printf("%f\n", host_sol[i]);
}

// Entry point: fills a host vector with pseudo-random values in [0, 1],
// uploads it once into the __constant__ array dev_x and runs foo with a
// threshold of 0.5.
//
// Returns 0 on success, EXIT_FAILURE if the upload to dev_x fails.
int main() {
    thrust::host_vector<float> x(n);

    //magic happens populate x
    // rand() is never seeded here, so every run produces the same sequence.
    for (int i = 0; i < n; i++) x[i] = rand() / (float)RAND_MAX;

    // One-time upload into constant memory; foo reads dev_x on the device, so
    // a failed copy must not be ignored.
    const cudaError_t status = cudaMemcpyToSymbol(dev_x, x.data(), n * sizeof(float));
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    // Float literal: foo takes its threshold as const float&.
    foo(x, 0.5f);
    return 0;
}

cuda - 常量内存中的 thrust::device_vector

1 回答 1

Related

Reference