cuda - 为什么 cusparse 没有 cusparsecscmm?

Question

cusparse 手册只提供了一个函数 cusparsecsrmm ，它将一个 CSR 格式的稀疏矩阵乘以一个密集矩阵，但是为什么它没有为 CSC 格式的稀疏矩阵提供一个 cusparsecscmm 函数（因为它是作为稀疏数据格式之一引入的）手册）？我错过了什么吗？

我按照 Eric 的建议进行了尝试，但失败了：

cusparse Error: 3 in cusparse_test.cpp at line 106

错误代码 3 是

CUSPARSE_STATUS_INVALID_VALUE invalid parameters were passed (m,n,k,nnz<0 or ldb and ldc are incorrect).

从手册。

以下是我的源代码：

#include <iostream>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>

#include <gsl/gsl_blas.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>


using namespace std;


#define TYPE double
#define Zratio 0.3 //The percentage of zero values
#define M 3
#define K 2
#define N 3

#define CALL_CUDA( err ) \
{ if (err != cudaSuccess) \
    {cout<<"cuda Error "<<err<<" in "<<__FILE__<<" at line "<<__LINE__<<"\n"; exit(EXIT_FAILURE); }\
}

static void HandleError( cusparseStatus_t err,
                         const char *file,
                         int line ) { 
    if (err != CUSPARSE_STATUS_SUCCESS) {
        cout<<"cusparse Error: "<<err<<" in "<<file<<" at line "<<line<<endl;
        exit( EXIT_FAILURE );
    }   
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

int main()
{
    cout<<"M: "<<M<<"; K: "<<K<<"; N: "<<N<<endl;

    const TYPE alpha =1.0, beta = 0.0;
    int i,j,k;
    double dif;

    float elapsedTime;
    cudaEvent_t start,stop;
    CALL_CUDA( cudaEventCreate( &start ) );
    CALL_CUDA( cudaEventCreate( &stop ) );

    cusparseHandle_t hdl;
    HANDLE_ERROR(cusparseCreate(&hdl));
    cusparseMatDescr_t descr;
    HANDLE_ERROR(cusparseCreateMatDescr(&descr));
    HANDLE_ERROR(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    HANDLE_ERROR(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));


    // Allocate host memory
    TYPE *A, *B, *Out, *x, *y;
    CALL_CUDA( cudaEventRecord( start, 0 ) );
    CALL_CUDA(cudaHostAlloc((void**)&A, M*K*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&B, K*N*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&Out, M*N*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA( cudaEventRecord( stop, 0 ) );
    CALL_CUDA( cudaEventSynchronize( stop ) );
    CALL_CUDA( cudaEventElapsedTime( &elapsedTime,start, stop ) );
    cout<<endl<<"Host allocation time: "<<elapsedTime<<" milliseconds"<<endl;
    TYPE randtmp;
    for(i=0; i<M; i++) for (j=0; j<K; j++) 
    {
        randtmp = (float)rand()/(float)RAND_MAX;
        A[j*M+i] = (randtmp<Zratio)?0:randtmp;
    }
    for(i=0; i<K; i++) for (j=0; j<N; j++) B[j*K+i] = (float)rand()/(float)RAND_MAX;

    cout<<"A:"<<endl; for (i=0; i<M; i++) {for (j=0; j<K; j++) cout<<A[j*M+i]<<" "; cout<<endl;}
    cout<<"B:"<<endl; for (i=0; i<K; i++) {for (j=0; j<N; j++) cout<<B[j*K+i]<<" "; cout<<endl;} cout<<endl;

    // Convert dense matrix to CSR format
    TYPE * cscVal;
    int * cscColPtr, * cscRowInd;
    int nnz, * nnzPerCol;
    CALL_CUDA(cudaHostAlloc((void**)&nnzPerCol, K*sizeof(int), cudaHostAllocMapped));
    HANDLE_ERROR(cusparseDnnz(hdl, CUSPARSE_DIRECTION_COLUMN, M, K, descr, A, M, nnzPerCol, &nnz));
    cout<<"A nnz "<<nnz<<endl;
    cout<<"A nnz per col: "; for (i=0; i<K; i++) cout<<nnzPerCol[i]<<" "; cout<<endl<<endl;
    CALL_CUDA(cudaMalloc((void**)&cscVal, nnz*sizeof(TYPE)));
    CALL_CUDA(cudaMalloc((void**)&cscColPtr, (K+1)*sizeof(int)));
    CALL_CUDA(cudaMalloc((void**)&cscRowInd, nnz*sizeof(int)));
    HANDLE_ERROR(cusparseDdense2csc(hdl, M, K, descr, A, M, nnzPerCol, cscVal, cscRowInd, cscColPtr));
    TYPE * hcscVal; int *hcscColPtr, *hcscRowInd;
    CALL_CUDA(cudaHostAlloc((void**)&hcscVal, nnz*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&hcscColPtr, (K+1)*sizeof(int), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&hcscRowInd, nnz*sizeof(int), cudaHostAllocMapped));
    CALL_CUDA(cudaMemcpy(hcscVal, cscVal, nnz*sizeof(TYPE), cudaMemcpyDeviceToHost));
    CALL_CUDA(cudaMemcpy(hcscColPtr, cscColPtr, (K+1)*sizeof(int), cudaMemcpyDeviceToHost));
    CALL_CUDA(cudaMemcpy(hcscRowInd, cscRowInd, nnz*sizeof(int), cudaMemcpyDeviceToHost));
    cout<<"cscVal: "<<endl; for (i=0; i<nnz; i++) cout<<hcscVal[i]<<" "; cout<<endl;
    cout<<"cscColPtr: "<<endl; for (i=0; i<K+1; i++) cout<<hcscColPtr[i]<<" "; cout<<endl;
    cout<<"cscRowInd: "<<endl; for (i=0; i<nnz; i++) cout<<hcscRowInd[i]<<" "; cout<<endl<<endl;


    // GPU-cusparse calculation
    TYPE *gB, *gOut;
    CALL_CUDA(cudaMalloc((void**)&gB,K*N*sizeof(TYPE)));
    CALL_CUDA(cudaMalloc((void**)&gOut,M*N*sizeof(TYPE)));
    CALL_CUDA(cudaMemcpy(gB, B, K*N*sizeof(TYPE), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cusparseDcsrmm(hdl, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, descr, cscVal, cscColPtr, cscRowInd, gB, K, &beta, gOut, M));
    CALL_CUDA(cudaMemcpy(Out, gOut, M*N*sizeof(TYPE), cudaMemcpyDeviceToHost));
    cout<<"Out:"<<endl; for (i=0; i<M; i++) {for (j=0; j<N; j++) cout<<Out[j*M+i]<<" "; cout<<endl;}; cout<<endl;


    //clean up
    CALL_CUDA(cudaFreeHost(A));
    CALL_CUDA(cudaFreeHost(B));
    CALL_CUDA(cudaFreeHost(hcscVal));
    CALL_CUDA(cudaFreeHost(hcscColPtr));
    CALL_CUDA(cudaFreeHost(hcscRowInd));
    CALL_CUDA(cudaFreeHost(Out));
    CALL_CUDA(cudaFree(gB));
    CALL_CUDA(cudaFree(cscVal));
    CALL_CUDA(cudaFree(cscColPtr));
    CALL_CUDA(cudaFree(cscRowInd));
    CALL_CUDA(cudaFree(gOut));
    return 0;
}

score 3 · Accepted Answer

因为没有必要提供单独的cscmm().

仅当您翻转参数时，现有的csrmm()才能执行将执行的确切操作。cscmm()cusparseOperation_t transA

您可以这样做的原因是矩阵的 CSC 表示A与转置矩阵的 CSR 表示完全相同A'

cuda - 为什么 cusparse 没有 cusparsecscmm?

1 回答 1

Related

Reference