cuda - 如何使用 CUDA Thrust 通过索引列表从矩阵中收集行

Question

这似乎是一个简单的问题，但我就是想不出一个优雅的方式来用 CUDA Thrust 做到这一点。

我有一个二维矩阵 NxM 和一个大小为 L 的所需行索引的向量，它是所有行的子集（即 L < N）并且不规则（基本上是一个不规则列表，如 7,11,13,205，... ETC。）。该矩阵按行存储在推力装置向量中。索引数组也是一个设备向量。这是我的两个问题：

从原始 NxM 矩阵复制所需行形成新矩阵 LxM 的最有效方法是什么？
是否可以为原始 NxM 矩阵创建一个迭代器，该迭代器仅取消对属于所需行的元素的引用？

非常感谢您的帮助。

score 3 · Accepted Answer

您所问的问题似乎是一个非常直接的流压缩问题，并且使用推力没有任何特别的问题，但是有一些曲折。为了选择要复制的行，您需要有一个流压缩算法可以使用的模板或密钥。这需要使用要复制的行列表通过搜索或选择操作来构建。

执行此操作的一个示例过程如下所示：

构造一个迭代器，它返回输入矩阵中任何条目的行号。推力有一个非常有用的counting_iterator，transform_iterator可以结合起来做到这一点
执行对该行号迭代器的搜索，以查找哪些条目与要复制的行列表匹配。thrust::binary search可用于此。搜索产生流压缩操作的模板
用于使用thrust::copy_if模板对输入矩阵执行流压缩。

这听起来像是很多工作和中间步骤，但计数和转换迭代器实际上并没有产生任何中间设备向量。唯一需要的中间存储是模板数组，它可以是一个布尔值（所以 m*n 字节）。

代码中的完整示例：

#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/device_vector.h>
#include <cstdio>

struct div_functor : public thrust::unary_function<int,int>
{
    int m;
    div_functor(int _m) : m(_m) {};

    __host__ __device__
    int operator()(int x) const
    {
        return x / m;
    }
};

struct is_true
{
    __host__ __device__
    bool operator()(bool x) { return x; }
};


int main(void)
{

    // dimensions of the problem
    const int m=20, n=5, l=4;

    // Counting iterator for generating sequential indices

    // Sample matrix containing 0...(m*n)
    thrust::counting_iterator<float> indices(0.f);
    thrust::device_vector<float> in_matrix(m*n);
    thrust::copy(indices, indices+(m*n), in_matrix.begin());

    // device vector contain rows to select
    thrust::device_vector<int> select(l);
    select[0] = 1;
    select[1] = 4;
    select[2] = 9;
    select[3] = 16;

    // construct device iterator supplying row numbers via a functor
    typedef thrust::counting_iterator<int> counter;
    typedef thrust::transform_iterator<div_functor, counter> rowIterator;
    rowIterator rows_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), div_functor(n));
    rowIterator rows_end = rows_begin + (m*n);

    // constructor a stencil array which indicates which entries will be copied
    thrust::device_vector<bool> docopy(m*n);
    thrust::binary_search(select.begin(), select.end(), rows_begin, rows_end, docopy.begin());

    // use stream compaction on the matrix with the stencil array
    thrust::device_vector<float> out_matrix(l*n);
    thrust::copy_if(in_matrix.begin(), in_matrix.end(), docopy.begin(), out_matrix.begin(), is_true());

    for(int i=0; i<(l*n); i++) {
        float val = out_matrix[i];
        printf("%i %f\n", i, val);
    }
}

（通常的免责声明：使用风险自负）

关于我要发表的唯一评论是，copy_if调用的谓词感觉有点多余，因为我们已经有一个可以直接使用的二进制模板，但似乎没有压缩算法的变体可以在二进制模板直接。同样，我想不出一种直接在流压缩调用中使用行列表的明智方法。很可能有一种更有效的方法可以通过推力来做到这一点，但这至少应该让你开始。

从您的评论来看，空间似乎很紧张，并且二进制搜索和模板创建的额外内存开销对于您的应用程序来说是禁止的。在这种情况下，我会遵循我在对 Roger Dahl 的回答的评论中提供的建议，并改用自定义复制内核。推力设备向量可以转换为可以直接传递给内核 ( thrust::raw_pointer_cast) 的指针，因此它不需要干扰现有的推力代码。我建议每行使用一个线程块来复制，这允许合并读取和写入，并且应该比使用thrust::copy每行更好。一个非常简单的实现可能看起来像这样（重用我的大部分推力示例）：

#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <cstdio>

__global__ 
void rowcopykernel(const float *in, float *out, const int *list, const int m, const int n, const int l)
{
    __shared__ const float * inrowp; 
    __shared__ float * outrowp;

    if (threadIdx.x == 0) {
        inrowp = (blockIdx.x < l) ? in + (n*list[blockIdx.x]) : 0;
        outrowp = out + (n*blockIdx.x);
    }
    __syncthreads();

    for(int i=threadIdx.x; (inrowp != 0) && (i<n); i+=blockDim.x) {
        *(outrowp+i) = *(inrowp+i);
    }
}

int main(void)
{
    // dimensions of the problem
    const int m=20, n=5, l=4;

    // Sample matrix containing 0...(m*n)
    thrust::counting_iterator<float> indices(0.f);
    thrust::device_vector<float> in_matrix(m*n);
    thrust::copy(indices, indices+(m*n), in_matrix.begin());

    // device vector contain rows to select
    thrust::device_vector<int> select(l);
    select[0] = 1;
    select[1] = 4;
    select[2] = 9;
    select[3] = 16;

    // Output matrix
    thrust::device_vector<float> out_matrix(l*n);

    // raw pointer to thrust vectors
    int * selp = thrust::raw_pointer_cast(&select[0]);
    float * inp = thrust::raw_pointer_cast(&in_matrix[0]);
    float * outp = thrust::raw_pointer_cast(&out_matrix[0]);

    dim3 blockdim = dim3(128);
    dim3 griddim = dim3(l);
    rowcopykernel<<<griddim,blockdim>>>(inp, outp, selp, m, n, l);

    for(int i=0; i<(l*n); i++) {
        float val = out_matrix[i];
        printf("%i %f\n", i, val);
    }
}

（标准免责声明：使用风险自负）。

执行参数的选择可以做得更漂亮，但除此之外，这应该是所需的全部。如果您的行非常小，您可能希望使用每行的扭曲而不是块进行调查（因此一个块复制几行）。如果您有超过 65535 个输出行，那么您将需要使用 2D 网格，或者修改代码以使每个块执行多行。但是，与基于推力的解决方案一样，这应该可以帮助您入门。

score 1 · Accepted Answer

如果您不固定推力，请查看Arrafire：

令人惊讶的是，与推力不同的是，这个库对下标索引具有原生支持，因此只需几行代码即可解决您的问题：

const int N = 7, M = 5;

float L_host[] = {3, 6, 4, 1}; 
int szL = sizeof(L_host) / sizeof(float); 

// generate random NxM matrix with cuComplex data 
array A = randu(N, M, c32); 
// array used to index rows
array L(szL, 1, L_host);

print(A);
print(L);

array B = A(L,span); // copy selected rows of A
print(B);

结果：

A =
       0.7402 +       0.9210i       0.6814 +       0.2920i       0.5786 +       0.5538i       0.2133 +       0.4131i       0.7305 +       0.9400i
       0.0390 +       0.9690i       0.3194 +       0.8109i       0.3557 +       0.7229i       0.0328 +       0.5360i       0.8432 +       0.6116i
       0.9251 +       0.4464i       0.1541 +       0.4452i       0.2783 +       0.6192i       0.7214 +       0.3546i       0.2674 +       0.0208i
       0.6673 +       0.1099i       0.2080 +       0.6110i       0.5876 +       0.3750i       0.2527 +       0.9847i       0.8331 +       0.7218i
       0.4702 +       0.5132i       0.3073 +       0.4156i       0.2405 +       0.4148i       0.9200 +       0.1872i       0.6087 +       0.6301i
       0.7762 +       0.2948i       0.2343 +       0.8793i       0.0937 +       0.6326i       0.1820 +       0.5984i       0.5298 +       0.8127i
       0.7140 +       0.3585i       0.6462 +       0.9264i       0.2849 +       0.7793i       0.7082 +       0.0421i       0.0593 +       0.4797i


L = (row indices)
       3.0000 
       6.0000 
       4.0000 
       1.0000




B =
       0.6673 +       0.1099i       0.2080 +       0.6110i       0.5876 +       0.3750i       0.2527 +       0.9847i       0.8331 +       0.7218i
       0.7140 +       0.3585i       0.6462 +       0.9264i       0.2849 +       0.7793i       0.7082 +       0.0421i       0.0593 +       0.4797i
       0.4702 +       0.5132i       0.3073 +       0.4156i       0.2405 +       0.4148i       0.9200 +       0.1872i       0.6087 +       0.6301i
       0.0390 +       0.9690i       0.3194 +       0.8109i       0.3557 +       0.7229i       0.0328 +       0.5360i       0.8432 +       0.6116i

它的工作速度也很快。我使用以下代码使用大小为 2000 x 2000 的 cuComplex 数组对此进行了测试：

float *g_data = 0, *g_data2 = 0;
int g_N = 2000, g_M = 2000, // matrix of size g_N x g_M
       g_L = 400;          // copy g_L rows
void af_test() 
{   
  array A(g_N, g_M, (cuComplex *)g_data, afDevicePointer);
  array L(g_L, 1, g_data2, afDevicePointer);
  array B = (A(L, span));
  std::cout << "sz: " << B.elements() << "\n";
}

int main()
{
   // input matrix N x M of cuComplex
   array in = randu(g_N, g_M, c32);
   g_data = (float *)in.device< cuComplex >();
   // generate unique row indices
   array in2 = setunique(floor(randu(g_L) * g_N));
   print(in2);
   g_data2 = in2.device<float>();

   const int N_ITERS = 30;
   try {
     info();
     af::sync();
     timer::tic();

     for(int i = 0; i < N_ITERS; i++) {
       af_test();      
     }
     af::sync();
     printf("af:  %.5f seconds\n",  timer::toc() / N_ITERS);


   } catch (af::exception& e) {
     fprintf(stderr, "%s\n", e.what());
  }
  in.unlock();
  in2.unlock();
}

score 0 · Accepted Answer

我不认为有办法用 Thrust 做到这一点，但是，因为该操作将受内存限制，所以编写一个以最大可能性能执行此操作的内核应该很容易。只需创建与向量中的索引相同数量的线程。让每个线程计算一行的源地址和目标地址，然后用于memcpy()复制该行。

您可能还需要仔细考虑是否可以设置后续处理步骤来访问适当的行，从而避免整个、昂贵的“压缩”操作，它只会打乱内存。即使寻址行变得稍微复杂一些（可能需要额外的内存查找和乘法），整体性能可能会好得多。

cuda - 如何使用 CUDA Thrust 通过索引列表从矩阵中收集行

3 回答 3

Related

Reference