我正在尝试通过 cublas CUDA 库使用线性方程求解器对矩阵求逆。
原方程的形式为:
Ax = B = I
I - identity matrix
A - The matrix I'm trying to inverse
x - (hopefully) the inverse(A) matrix
我想执行 LU 分解,收到以下信息:
LUx = B
L - is a lower triangle matrix
U - is a upper triangle matrix
这是一个很好的例子,可以展示我正在尝试做的事情:
http://www.youtube.com/watch?v=dza5JTvMpzk
为了便于讨论,假设我已经对 A 进行了 LU 分解。(A = LU)。我试图在两步系列中找到逆:
Ax = B = I
LUx = B = I
1st step: Declare y:
**Ly = I**
solve 1st linear equation
2nd step, Solve the following linear equation
**Ux = y**
find X = inverse(A) - *Bingo!@!*
现在我不知道我在这里做错了什么。可能有两个假设,要么我没有正确使用 cublas,要么 cublas 无缘无故抛出异常。
见附上我的代码,最后有Matlab代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
//#include "cublas.h"
#include "cublas_v2.h"
int main()
{
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
//cublasInit();
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
printf ("CUBLAS initialization failed\n");
return -1;
}
unsigned int size = 3;
// Allowcate device matrixes
float* p_h_LowerTriangle;
float* p_d_LowerTriangle;
p_h_LowerTriangle = new float[size*size];
p_h_LowerTriangle[0] = 1.f;
p_h_LowerTriangle[1] = 0.f;
p_h_LowerTriangle[2] = 0.f;
p_h_LowerTriangle[3] = 2.56f;
p_h_LowerTriangle[4] = 1.f;
p_h_LowerTriangle[5] = 0.f;
p_h_LowerTriangle[6] = 5.76f;
p_h_LowerTriangle[7] = 3.5f;
p_h_LowerTriangle[8] = 1.f;
float* p_h_UpperTriangle;
float* p_d_UpperTriangle;
p_h_UpperTriangle = new float[size*size];
p_h_UpperTriangle[0] = 25.f;
p_h_UpperTriangle[1] = 5.f;
p_h_UpperTriangle[2] = 1.f;
p_h_UpperTriangle[3] = 0.f;
p_h_UpperTriangle[4] = -4.8f;
p_h_UpperTriangle[5] = -1.56f;
p_h_UpperTriangle[6] = 0.f;
p_h_UpperTriangle[7] = 0.f;
p_h_UpperTriangle[8] = 0.7f;
float* p_h_IdentityMatrix;
float* p_d_IdentityMatrix;
p_h_IdentityMatrix = new float[size*size];
p_h_IdentityMatrix[0] = 1.f;
p_h_IdentityMatrix[1] = 0.f;
p_h_IdentityMatrix[2] = 0.f;
p_h_IdentityMatrix[3] = 0.f;
p_h_IdentityMatrix[4] = 1.f;
p_h_IdentityMatrix[5] = 0.f;
p_h_IdentityMatrix[6] = 0.f;
p_h_IdentityMatrix[7] = 0.f;
p_h_IdentityMatrix[8] = 1.f;
cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float));
float* p_d_tempMatrix;
cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float));
stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size);
cudaDeviceSynchronize();
// 1st phase - solve Ly = b
const float alpha = 1.f;
// Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result
stat = cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_UNIT,
size,
size,
&alpha,
p_d_LowerTriangle,
size,
p_d_IdentityMatrix,
size);
////////////////////////////////////
// TODO: printf 1st phase the results
cudaDeviceSynchronize();
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size);
////////////////////////////////////
// 2nd phase - solve Ux = b
stat = cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_UPPER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
size,
size,
&alpha,
p_d_UpperTriangle,
size,
p_d_IdentityMatrix,
size);
// TODO print the results
cudaDeviceSynchronize();
////////////////////////////////////
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
////////////////////////////////////
cublasDestroy(handle);
//cublasShutdown();
cudaFree(p_d_LowerTriangle);
cudaFree(p_d_UpperTriangle);
cudaFree(p_d_IdentityMatrix);
free(p_h_LowerTriangle);
free(p_h_UpperTriangle);
free(p_h_IdentityMatrix);
return 0;
}
Matlab 代码 - 完美运行:
clear all
UpperMatrix = [ 25 5 1 ; 0 -4.8 -1.56 ; 0 0 0.7 ]
LowerMatrix = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1 ]
IdentityMatrix = eye(3)
% 1st phase solution
otps1.LT = true;
y = linsolve(LowerMatrix,IdentityMatrix,otps1)
%2nd phase solution
otps2.UT = true;
y = linsolve(UpperMatrix,y,otps2)
MATLAB 输出
上矩阵 =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
下矩阵 =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
身份矩阵 =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
y =
0.0476 -0.0833 0.0357
-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286
上矩阵 =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
下矩阵 =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
身份矩阵 =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
>>
>>
>>
>> Inverse_LU_UT
上矩阵 =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
下矩阵 =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
身份矩阵 =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
y =
0.0476 -0.0833 0.0357
-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286
我不知道我在这里做错了什么,我怀疑 cublasCreate 操作。每次我点击命令:
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
stat 和 handle 变量都是有效的。
但是 VS10 输出了几个错误信息,具体如下:
0x 处的第一次机会异常... 内存位置 0x 处的 Microsoft C++ 异常 cudaError_enum ...
有人可能会说这是一个内部 cublas 错误消息,由库本身处理。