1

我正忙于研究 LS 方法,我手动实现了一个共轭梯度求解器,但是在更新我的 CUDA 版本后,我看到有一个新函数(cusolverDnSSgels),我认为它比我的手动实现要快。我的第一个任务是尝试在测试用例上运行它(见下文),我希望结果是:-6.5, 9.7根据 MA​​Tlab。不幸的是我找不到我做错了什么,我也找不到一个例子,因为它是一个相对较新的功能。

输出表明niter= -3,根据文档,这将表明迭代次数过多,但这没有意义,因为它是一个非常小的矩阵,应该很容易解决。

#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "device_launch_parameters.h"


int main()
{   
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cusolverDnHandle_t cusolverH;
    cusolverStatus_t stat;

    // create handle
    stat = cusolverDnCreate(&cusolverH);

    //params
    const int C = 3;
    const int M = 2;
    long lda = C;

    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat, *gXmat;

    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));

    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];

    }

    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;

    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    cudaMalloc(&gXmat, M * 1 * sizeof(float));

    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);

    float *gdwork;
    size_t work_bytes;

    stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);

    std::cout << "Status = " << stat << std::endl;

    int niter = 0;
    int dinfo = 0;

    cudaMalloc(&gdwork, work_bytes * sizeof(float));

    stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, &dinfo);

    std::cout << "Status = " << stat  << std::endl;
    std::cout << "niter = "  << niter << std::endl;
    std::cout << "dinfo = "  << dinfo << std::endl;

    cudaDeviceSynchronize();

    cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);


    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;

    //free memory
    cudaFree(gdwork);
    free(Amat);
    free(Ymat);
    free(Xmat);


    cudaFree(gXmat);
    cudaFree(gAmat);
    cudaFree(gYmat);

    //destory handle
    cusolverDnDestroy(cusolverH);



    return 0;
}

我得到的结果是:

Status = 0
Status = 0
niter = -3
dinfo = 0
-4.31602e+08, -4.31602e+08

有人可以指出我做错了什么吗?

4

1 回答 1

2

您的dinfo参数使用有问题。参考文档,我们看到:

cusolverDngels() 函数的参数

参数 Memory In/out 含义

dinfo device output 返回时 IRS 求解器的状态。如果 0 - 解决成功。如果 dinfo = -i 则第 i 个参数无效。

dinfo参数预计将存在于设备内存中。但是你在主机内存中有它:

int dinfo = 0;

如果我将存储移动到正确的位置,您的代码会按预期输出您指示的值:

$ cat t143.cu
#include <iostream>
#include <cublas_v2.h>
#include <cusolverDn.h>


int main()
{
    //init id, handle and stat
    int id = cudaGetDevice(&id);
    cusolverDnHandle_t cusolverH;
    cusolverStatus_t stat;

    // create handle
    stat = cusolverDnCreate(&cusolverH);

    //params
    const int C = 3;
    const int M = 2;
    long lda = C;

    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat, *gXmat;

    //allocate mem
    Amat = (float*)malloc(M * C * sizeof(float));
    Ymat = (float*)malloc(C * sizeof(float));
    Xmat = (float*)malloc(M * sizeof(float));

    srand(100);
#if 0
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];

    }

    for (int i = 0; i < C; i++) {
        Ymat[i] = rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }
#endif
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;

    //allocate mem
    cudaMalloc(&gAmat, M * C * sizeof(float));
    cudaMalloc(&gYmat, C * sizeof(float));
    cudaMalloc(&gXmat, M * 1 * sizeof(float));

    //copy mem
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);

    float *gdwork;
    size_t work_bytes;

    stat = cusolverDnSSgels_bufferSize(cusolverH,C, M, 1, gAmat, lda, gYmat, C, gXmat, M, NULL, &work_bytes);

    std::cout << "Status = " << stat << std::endl;

    int niter = 0;
    int *dinfo, hinfo;

    cudaMalloc(&gdwork, work_bytes * sizeof(float));
    cudaMalloc(&dinfo, sizeof(int));

    stat = cusolverDnSSgels(cusolverH, C, M, 1, gAmat, lda, gYmat, C, gXmat, M, gdwork, work_bytes, &niter, dinfo);
    cudaMemcpy(&hinfo, dinfo, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Status = " << stat  << std::endl;
    std::cout << "niter = "  << niter << std::endl;
    std::cout << "dinfo = "  << hinfo << std::endl;

    cudaDeviceSynchronize();

    cudaMemcpy(Xmat, gXmat, M * 1 * sizeof(float), cudaMemcpyDeviceToHost);


    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << std::endl;

    //free memory
    cudaFree(gdwork);
    free(Amat);
    free(Ymat);
    free(Xmat);


    cudaFree(gXmat);
    cudaFree(gAmat);
    cudaFree(gYmat);

    //destory handle
    cusolverDnDestroy(cusolverH);



    return 0;
}
$ nvcc -o t143 t143.cu -lcublas -lcusolver
$ cuda-memcheck ./t143
========= CUDA-MEMCHECK
Status = 0
Status = 0
niter = -51
dinfo = 0
-6.5, 9.7
========= ERROR SUMMARY: 0 errors
$

笔记:

  • 我在上面使用 CUDA 11.3。如果您使用的是早期版本,我强烈建议您升级到 CUDA 11.3 或更高版本以使用此功能。

  • 您可以通过运行代码来获得有关问题的提示cuda-memcheck

  • 通过使用文档中给出的参数位置(主机/设备)表查看参数使用情况,可以很快发现问题。您在这里遇到了一个类似的问题,您可以通过对照文档中给出的表格查看您的参数位置(主机/设备)来专注于该问题。这可能是一件好事,可以检查以节省您将来的时间。

于 2021-05-17T19:15:50.117 回答