我正在尝试反转由复数组成的矩阵,其中我使用矩阵求逆代码来表示“用户” cuda 矩阵逆高斯约旦在以下链接中发布的实数
代码编译,没有错误,但问题是输出错误!我不知道我哪里错了。任何人都可以,请,帮助。先感谢您!
这是完整的代码:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n = 3;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(0,1);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCimag(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *d_L, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n/16,n/16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
dim3 numBlocks(16,16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from GPU to CPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost );
cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost );
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCimag(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
感谢@RobertCrovella 为您提供快速且非常有见地的建议!关于您对我的问题的回答:我更改了我的 threadsPerBlock(4,4) 和 numBlocks(1,1) 所以我将使用 1 个块和 16 个线程作为我的 4x4 矩阵。我的输入矩阵如下
1 0 0 0
0 2 0 0
0 0 3 0
0 0 0 4
这里所有的数字都是实数,那么预期的倒矩阵应该看起来像
1 0 0 0
0 1/2 0 0
0 0 1/3 0
0 0 0 1/4
我一点也不明白。我输入了 cuda memcheck 工具来查看我的内核是否没有在午餐,但它没有显示任何错误消息。我最近开始学习 CUDA,没有太多经验。谁能给出更详细的答复?谢谢你!
这是我修改后的代码。
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I,int n, int i)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
cuDoubleComplex P;
if(x<n && y<n)
if(x>i){
P=A[x*n+i]/A[i*n+i];
I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
if(y>=i){
A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
}
}
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
cuDoubleComplex temp = make_cuDoubleComplex(0,0);
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<h && y<h)
if( cuCimag(d_A[x*h+x]) != 0 ){
if( cuCreal(d_A[x*h+x]) != 0 ){
dI[x*h+y] = dI[x*h+y]/d_A[x*h+x];
d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
}
}
__syncthreads();
}
int main()
{
int const n= 4;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCreal(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
dim3 numBlocks(1,1); //!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from GPU to CPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
gpuErrchk( cudaPeekAtLastError() );
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCreal(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}