我尝试使用 CUDA 的 cusolver 库在 GPU 上执行 QR 分解。
我将我的问题简化为下面的示例。
基本上,这几个步骤是:
- 我分配内存并在主机上用 1 初始化了一个 [5x3] 矩阵,
- 我在设备上分配内存,并将矩阵复制到设备上
- 我初始化求解器句柄
cusolverDnCreate
- 我确定所需工作空间的大小
cusolverDnDgeqrf_bufferSize
- 最后,尝试执行 QR 分解
cusolverDnDgeqrf
不幸的是,最后一个调用总是失败,返回 CUSOLVER_STATUS_EXECUTION_FAILED
(整数值为 6),我不知道问题出在哪里!
下面是出错的代码:
#include <cstdlib>
#include <iostream>

#include <cuda_runtime_api.h>
#include <cusolverDn.h>
int main(void)
{
int N = 5, P = 3;
double *hostData;
cudaMallocHost((void **) &hostData, N * sizeof(double));
for (int i = 0; i < N * P; ++i)
hostData[i] = 1.;
double *devData;
cudaMalloc((void**)&devData, N * sizeof(double));
cudaMemcpy((void*)devData, (void*)hostData, N * sizeof(double), cudaMemcpyHostToDevice);
cusolverStatus_t retVal;
cusolverDnHandle_t solverHandle;
retVal = cusolverDnCreate(&solverHandle);
std::cout << "Handler creation : " << retVal << std::endl;
double *devTau, *work;
int szWork;
cudaMalloc((void**)&devTau, P * sizeof(double));
retVal = cusolverDnDgeqrf_bufferSize(solverHandle, N, P, devData, N, &szWork);
std::cout << "Work space sizing : " << retVal << std::endl;
cudaMalloc((void**)&work, szWork * sizeof(double));
int *devInfo;
cudaMalloc((void **)&devInfo, 1);
retVal = cusolverDnDgeqrf(solverHandle, N, P, devData, N, devTau, work, szWork, devInfo); //CUSOLVER_STATUS_EXECUTION_FAILED
std::cout << "QR factorization : " << retVal << std::endl;
int hDevInfo = 0;
cudaMemcpy((void*)devInfo, (void*)&hDevInfo, 1 * sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Info device : " << hDevInfo << std::endl;
cudaFree(devInfo);
cudaFree(work);
cudaFree(devTau);
cudaFree(devData);
cudaFreeHost(hostData);
cudaDeviceReset();
}
如果您在我的代码中发现任何明显的错误,请告诉我!非常感谢。