0

在 cusp 中,有一个 multiply 例程用于计算 spmv(稀疏矩阵向量乘法),它需要一个 combine 和一个 reduce:

 // CUSP's generalized SpMV/SpMM entry point: computes C from A and B where the
 // usual (multiply, add) semiring is replaced by user-supplied functors.
 //   initialize - unary functor seeding each output element before reduction
 //   combine    - binary functor replacing scalar multiplication A(i,j) * B(j)
 //   reduce     - binary functor replacing scalar addition of partial products
 template <typename LinearOperator,
             typename MatrixOrVector1,
             typename MatrixOrVector2,
             typename UnaryFunction,
             typename BinaryFunction1,
             typename BinaryFunction2>
    void multiply(const LinearOperator&  A,
                  const MatrixOrVector1& B,
                  MatrixOrVector2& C,
                  UnaryFunction  initialize,
                  BinaryFunction1 combine,
                  BinaryFunction2 reduce);

从接口看来,对于任何矩阵/向量乘法,自定义 combine 和 reduce 都应该是可能的。我认为 cusp 支持使用 thrust/functional.h 中定义的、乘法和加法之外的其他 combine 和 reduce 函数来计算 spmv。例如,我可以使用 thrust::plus 来代替原来的 combine 函数(即乘法)吗?而且我猜,这个广义的 spmv 也支持 coo、csr、dia、hyb 格式的稀疏矩阵。

但是,当我在矩阵 A 为 coo 格式的 a.cu 中测试以下示例时,我得到了错误的答案。它使用加号运算符进行组合。我用 cmd 编译它:nvcc a.cu -o ato 。

#include <cstdlib>
#include <iostream>

#include <cusp/csr_matrix.h>
#include <cusp/krylov/cg.h>
#include <cusp/monitor.h>
#include <cusp/multiply.h>
#include <cusp/print.h>

int main(void)
{
    // Abort with a readable message on any CUDA runtime failure. Without
    // this, an earlier sticky error makes every later call fail mysteriously
    // and the program still prints (garbage) results.
    auto check = [](cudaError_t status) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // 6x7 sparse matrix with 13 nonzero entries, COO format, host memory.
    int host_I[13] = {0,0,1,1,2,2,2,3,3,3,4,5,5}; // COO row indices
    int host_J[13] = {0,1,1,2,2,4,6,3,4,5,5,5,6}; // COO column indices
    int host_V[13] = {1,1,1,1,1,1,1,1,1,1,1,1,1}; // nonzero values (all 1)

    // Dense input vector x (length 7) and output vector y (length 6).
    int host_x[7] = {1,1,1,1,1,1,1};
    int host_y[6] = {0,0,0,0,0,0};

    // Allocate device memory for the COO arrays.
    int *device_I, *device_J, *device_V;
    check(cudaMalloc(&device_I, 13 * sizeof(int)));
    check(cudaMalloc(&device_J, 13 * sizeof(int)));
    check(cudaMalloc(&device_V, 13 * sizeof(int)));

    // Allocate device memory for the x and y arrays.
    int *device_x, *device_y;
    check(cudaMalloc(&device_x, 7 * sizeof(int)));
    check(cudaMalloc(&device_y, 6 * sizeof(int)));

    // Copy raw data from host to device. cudaMemcpy is blocking, so no
    // explicit synchronization is required afterwards.
    check(cudaMemcpy(device_I, host_I, 13 * sizeof(int), cudaMemcpyHostToDevice));
    check(cudaMemcpy(device_J, host_J, 13 * sizeof(int), cudaMemcpyHostToDevice));
    check(cudaMemcpy(device_V, host_V, 13 * sizeof(int), cudaMemcpyHostToDevice));
    check(cudaMemcpy(device_x, host_x,  7 * sizeof(int), cudaMemcpyHostToDevice));
    check(cudaMemcpy(device_y, host_y,  6 * sizeof(int), cudaMemcpyHostToDevice));

    // Matrices and vectors now reside on the device.

    // *NOTE* raw pointers must be wrapped with thrust::device_ptr before
    // CUSP views can be built on top of them.
    thrust::device_ptr<int> wrapped_device_I(device_I);
    thrust::device_ptr<int> wrapped_device_J(device_J);
    thrust::device_ptr<int> wrapped_device_V(device_V);
    thrust::device_ptr<int> wrapped_device_x(device_x);
    thrust::device_ptr<int> wrapped_device_y(device_y);

    // array1d_view wraps each raw array without copying. No `typename` here:
    // this is not a template context, so `typedef typename ...` is ill-formed
    // (the original compiled only as a compiler extension).
    typedef cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView;
    typedef cusp::array1d_view< thrust::device_ptr<int> > DeviceValueArrayView;

    DeviceIndexArrayView row_indices   (wrapped_device_I, wrapped_device_I + 13);
    DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + 13);
    DeviceValueArrayView values        (wrapped_device_V, wrapped_device_V + 13);
    DeviceValueArrayView x             (wrapped_device_x, wrapped_device_x + 7);
    DeviceValueArrayView y             (wrapped_device_y, wrapped_device_y + 6);

    // Combine the three array1d_views into a coo_matrix_view type.
    typedef cusp::coo_matrix_view<DeviceIndexArrayView,
            DeviceIndexArrayView,
            DeviceValueArrayView> DeviceView;

    // Construct a 6x7 coo_matrix_view with 13 entries from the array1d_views.
    DeviceView A(6, 7, 13, row_indices, column_indices, values);

    std::cout << "\ndevice coo_matrix_view" << std::endl;
    cusp::print(A);

    // Generalized SpMV: conceptually y[i] = reduce_j combine(A(i,j), x[j]),
    // seeded with initialize(y[i]). Here combine is plus instead of the
    // conventional multiply.
    cusp::constant_functor<int> initialize;
    thrust::plus<int> combine;
    thrust::plus<int> reduce;
    cusp::multiply(A, x, y, initialize, combine, reduce);
    check(cudaGetLastError());      // surface kernel launch-configuration errors
    check(cudaDeviceSynchronize()); // surface asynchronous execution errors

    std::cout << "\nx array" << std::endl;
    cusp::print(x);
    std::cout << "\n y array, y = A * x" << std::endl;
    cusp::print(y);

    // Copy the result back; host_y now holds y for host-side verification.
    check(cudaMemcpy(host_y, device_y, 6 * sizeof(int), cudaMemcpyDeviceToHost));

    // Free device arrays.
    check(cudaFree(device_I));
    check(cudaFree(device_J));
    check(cudaFree(device_V));
    check(cudaFree(device_x));
    check(cudaFree(device_y));

    return 0;
}

我得到了以下答案。

device coo_matrix_view
sparse matrix <6, 7> with 13 entries
              0              0        (1)
              0              1        (1)
              1              1        (1)
              1              2        (1)
              2              2        (1)
              2              4        (1)
              2              6        (1)
              3              3        (1)
              3              4        (1)
              3              5        (1)
              4              5        (1)
              5              5        (1)
              5              6        (1)
x array
array1d <7>

        (1)
        (1)
        (1)
        (1)
        (1)
        (1)
        (1)
 y array, y = A * x
array1d <6>
        (4)
        (4)
        (6)
        (6)
        (2)
        (631)

我得到的向量 y 很奇怪,我认为正确的答案 y 应该是:

[9,
9,
10,
10,
8,
9]

所以我不确定combine和reduce的这种替换是否可以适应其他稀疏矩阵格式,比如coo。或者我上面写的代码调用乘法可能是不正确的。你能给我一些帮助吗?任何信息都会有所帮助。

谢谢!

4

1 回答 1

1

对您的示例做了简短的代码阅读和插桩之后,这看起来是 CUSP 内部一个相当严重的缺陷,正是它导致了这个用例出错。该代码似乎只有在 combine 运算符是乘法时才碰巧得到正确结果,因为此时它对零元素执行的那些多余运算不会影响归约结果(也就是说,它只是额外加上了许多个零)。

于 2017-06-19T07:34:10.380 回答