我有一些规则构建的稀疏 3 对角 NxN 矩阵A
,并且想要解决系统问题Ax=b
。为此,我使用cusolverSpScsrlsvqr()
来自 cuSolverSp 模块。设备版本比cusolverSpScsrlsvqrHost()
大 N 慢很多倍可以吗?例如,对于 N=2^14,设备时间为 174.1 毫秒,主机时间为 3.5 毫秒。我在 RTX 2060 上。
代码:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cusolverSp.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <iostream>
#include <iomanip>
#include <chrono>
using namespace std;
void checkCudaCusolverStatus(cusolverStatus_t status, char const* operation) {
char const *str = "UNKNOWN STATUS";
switch (status) {
case CUSOLVER_STATUS_SUCCESS:
str = "CUSOLVER_STATUS_SUCCESS";
break;
case CUSOLVER_STATUS_NOT_INITIALIZED:
str = "CUSOLVER_STATUS_NOT_INITIALIZED";
break;
case CUSOLVER_STATUS_ALLOC_FAILED:
str = "CUSOLVER_STATUS_ALLOC_FAILED";
break;
case CUSOLVER_STATUS_INVALID_VALUE:
str = "CUSOLVER_STATUS_INVALID_VALUE";
break;
case CUSOLVER_STATUS_ARCH_MISMATCH:
str = "CUSOLVER_STATUS_ARCH_MISMATCH";
break;
case CUSOLVER_STATUS_MAPPING_ERROR:
str = "CUSOLVER_STATUS_MAPPING_ERROR";
break;
case CUSOLVER_STATUS_EXECUTION_FAILED:
str = "CUSOLVER_STATUS_EXECUTION_FAILED";
break;
case CUSOLVER_STATUS_INTERNAL_ERROR:
str = "CUSOLVER_STATUS_INTERNAL_ERROR";
break;
case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
str = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
break;
case CUSOLVER_STATUS_ZERO_PIVOT:
str = "CUSOLVER_STATUS_ZERO_PIVOT";
break;
}
cout << left << setw(30) << operation << " " << str << endl;
}
__global__ void fillAB(float *aValues, int *aRowPtrs, int *aColIdxs, float *b, int const n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= n) return;
if (i == 0) {
float xn = 10 * (n + 1);
aValues[n * 3] = xn;
aRowPtrs[0] = 0;
aRowPtrs[n + 1] = n * 3 + 1;
aColIdxs[n * 3] = n;
b[n] = xn * 2;
}
float xi = 10 * (i + 1);
aValues[i * 3 + 0] = xi;
aValues[i * 3 + 1] = xi + 5;
aValues[i * 3 + 2] = xi - 5;
aColIdxs[i * 3 + 0] = i;
aColIdxs[i * 3 + 1] = i + 1;
aColIdxs[i * 3 + 2] = i;
aRowPtrs[i + 1] = 2 + (i * 3);
b[i] = xi * 2;
}
int main() {
int const n = (int)pow(2, 14); // <<<<<<<<<<<<<<<<<<<<<<<<<<<<< N HERE
int const valCount = n * 3 - 2;
float *const aValues = new float[valCount];
int *const aRowPtrs = new int[n + 1];
int *const aColIdxs = new int[valCount];
float *const b = new float[n];
float *const x = new float[n];
float *dev_aValues;
int *dev_aRowPtrs;
int *dev_aColIdxs;
float *dev_b;
float *dev_x;
int aValuesSize = sizeof(float) * valCount;
int aRowPtrsSize = sizeof(int) * (n + 1);
int aColIdxsSize = sizeof(int) * valCount;
int bSize = sizeof(float) * n;
int xSize = sizeof(float) * n;
cudaMalloc((void**)&dev_aValues, aValuesSize);
cudaMalloc((void**)&dev_aRowPtrs, aRowPtrsSize);
cudaMalloc((void**)&dev_aColIdxs, aColIdxsSize);
cudaMalloc((void**)&dev_b, bSize);
cudaMalloc((void**)&dev_x, xSize);
fillAB<<<1024, (int)ceil(n / 1024.f)>>>(dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, n - 1);
cudaMemcpy(aValues, dev_aValues, aValuesSize, cudaMemcpyDeviceToHost);
cudaMemcpy(aRowPtrs, dev_aRowPtrs, aRowPtrsSize, cudaMemcpyDeviceToHost);
cudaMemcpy(aColIdxs, dev_aColIdxs, aColIdxsSize, cudaMemcpyDeviceToHost);
cudaMemcpy(b, dev_b, bSize, cudaMemcpyDeviceToHost);
cusolverSpHandle_t handle;
checkCudaCusolverStatus(cusolverSpCreate(&handle), "cusolverSpCreate");
cusparseMatDescr_t aDescr;
cusparseCreateMatDescr(&aDescr);
cusparseSetMatIndexBase(aDescr, CUSPARSE_INDEX_BASE_ZERO);
cusparseSetMatType(aDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
int singularity;
cusolverStatus_t status;
cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
cudaDeviceSynchronize();
auto t0 = chrono::high_resolution_clock::now();
for (int i = 0; i < 10; ++i)
////////////////////// CUSOLVER HERE //////////////////////
status = cusolverSpScsrlsvqr(handle, n, valCount, aDescr, dev_aValues, dev_aRowPtrs, dev_aColIdxs, dev_b, 0.1f, 0, dev_x, &singularity);
//status = cusolverSpScsrlsvqrHost(handle, n, valCount, aDescr, aValues, aRowPtrs, aColIdxs, b, 0.1f, 0, x, &singularity);
///////////////////////////////////////////////////////////
cudaDeviceSynchronize();
auto t1 = chrono::high_resolution_clock::now();
checkCudaCusolverStatus(status, "cusolverSpScsrlsvqr");
checkCudaCusolverStatus(cusolverSpDestroy(handle), "cusolverSpDestroy");
cout << "System solved: " << setw(20) << fixed << right << setprecision(3) << (t1 - t0).count() / 10.0 / 1000000 << " ms" << endl;
cudaMemcpy(x, dev_x, xSize, cudaMemcpyDeviceToHost);
/*for (int i = 0; i < n; ++i) {
cout << " " << x[i];
}*/
cudaFree(dev_aValues);
cudaFree(dev_aRowPtrs);
cudaFree(dev_aColIdxs);
cudaFree(dev_b);
cudaFree(dev_x);
delete[] aValues;
delete[] aRowPtrs;
delete[] aColIdxs;
delete[] b;
delete[] x;
cudaDeviceReset();
return 0;
}