我有一个取自 MAGMA 文档的简单程序,用于在 GPU 上对大型矩阵求逆:
#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
int main() {
//double *d_lA[MagmaMaxGPUs];
magma_init (); // initialize Magma
magma_queue_t queue=NULL;
magma_int_t dev=0;
magma_queue_create(dev ,&queue );
double gpu_time , *dwork; // dwork - workspace
magma_int_t ldwork; // size of dwork
magma_int_t *piv , info; // piv - array of indices of inter -
magma_int_t m = 20000; // changed rows; a - mxm matrix
magma_int_t mm=m*m; // size of a, r, c
double *a; // a- mxm matrix on the host
double *d_a; // d_a - mxm matrix a on the device
double *d_r; // d_r - mxm matrix r on the device
double *d_c; // d_c - mxm matrix c on the device
magma_int_t ione = 1;
magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
magma_int_t err;
const double alpha = 1.0; // alpha =1
const double beta = 0.0; // beta=0
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
// allocate matrices
err = magma_dmalloc_cpu( &a , mm ); // host memory for a
err = magma_dmalloc( &d_a , mm ); // device memory for a
err = magma_dmalloc( &d_r , mm ); // device memory for r
err = magma_dmalloc( &d_c , mm ); // device memory for c
err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
// generate random matrix a // for piv
lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
printf("here1\n");
magma_dsetmatrix( m, m, a, m, d_a , m, queue); // copy a -> d_a
magmablas_dlacpy(MagmaFull , m, m, d_a , m, d_r ,m,queue);//d_a ->d_r
// find the inverse matrix: d_a*X=I using the LU factorization
// with partial pivoting and row interchanges computed by
// magma_dgetrf_gpu; row i is interchanged with row piv(i);
// d_a -mxm matrix; d_a is overwritten by the inverse
gpu_time = magma_sync_wtime(NULL);
printf("here2\n");
//magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
magma_dgetrf_gpu(m, m, d_a, m, piv, &info);
magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
printf("here3\n");
gpu_time = magma_sync_wtime(NULL)-gpu_time;
magma_dgemm(MagmaNoTrans ,MagmaNoTrans ,m,m,m,alpha ,d_a ,m,
d_r ,m,beta ,d_c ,m,queue); // multiply a^-1*a
printf("magma_dgetrf_gpu + magma_dgetri_gpu time: %7.5f sec.\
\n",gpu_time );
magma_dgetmatrix( m, m, d_c , m, a, m, queue); // copy d_c ->a
printf("upper left corner of a^-1*a:\n");
magma_dprint( 4, 4, a, m ); // part of a^-1*a
free(a); // free host memory
free(piv); // free host memory
magma_free(d_a); // free device memory
magma_free(d_r); // free device memory
magma_free(d_c); // free device memory
magma_queue_destroy(queue); // destroy queue
magma_finalize (); // finalize Magma
return 0;
}
一切正常,运行时间也如预期的那样很短。
现在,如果您查看上面的代码,您会看到我注释了 2 行:
//double *d_lA[MagmaMaxGPUs];
和
//magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
实际上,取消这 2 行的注释,并把 magma_dgetrf_gpu(m, m, d_a, m, piv, &info);
替换为 magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);,
是为了让这段代码分布到 2 块 GPU 卡(2 块 RTX A6000)上运行。
我已经按照 MAGMA 文档的说明设置了环境变量 MAGMA_NUM_GPUS=2。
所以我有代码:
#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
int main() {
double *d_lA[MagmaMaxGPUs];
magma_init (); // initialize Magma
magma_queue_t queue=NULL;
magma_int_t dev=0;
magma_queue_create(dev ,&queue );
double gpu_time , *dwork; // dwork - workspace
magma_int_t ldwork; // size of dwork
magma_int_t *piv , info; // piv - array of indices of inter -
magma_int_t m = 20000; // changed rows; a - mxm matrix
magma_int_t mm=m*m; // size of a, r, c
double *a; // a- mxm matrix on the host
double *d_a; // d_a - mxm matrix a on the device
double *d_r; // d_r - mxm matrix r on the device
double *d_c; // d_c - mxm matrix c on the device
magma_int_t ione = 1;
magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
magma_int_t err;
const double alpha = 1.0; // alpha =1
const double beta = 0.0; // beta=0
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
// allocate matrices
err = magma_dmalloc_cpu( &a , mm ); // host memory for a
err = magma_dmalloc( &d_a , mm ); // device memory for a
err = magma_dmalloc( &d_r , mm ); // device memory for r
err = magma_dmalloc( &d_c , mm ); // device memory for c
err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
// generate random matrix a // for piv
lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
printf("here1\n");
magma_dsetmatrix( m, m, a, m, d_a , m, queue); // copy a -> d_a
magmablas_dlacpy(MagmaFull , m, m, d_a , m, d_r ,m,queue);//d_a ->d_r
// find the inverse matrix: d_a*X=I using the LU factorization
// with partial pivoting and row interchanges computed by
// magma_dgetrf_gpu; row i is interchanged with row piv(i);
// d_a -mxm matrix; d_a is overwritten by the inverse
gpu_time = magma_sync_wtime(NULL);
printf("here2\n");
magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
printf("here3\n");
gpu_time = magma_sync_wtime(NULL)-gpu_time;
magma_dgemm(MagmaNoTrans ,MagmaNoTrans ,m,m,m,alpha ,d_a ,m,
d_r ,m,beta ,d_c ,m,queue); // multiply a^-1*a
printf("magma_dgetrf_gpu + magma_dgetri_gpu time: %7.5f sec.\
\n",gpu_time );
magma_dgetmatrix( m, m, d_c , m, a, m, queue); // copy d_c ->a
printf("upper left corner of a^-1*a:\n");
magma_dprint( 4, 4, a, m ); // part of a^-1*a
free(a); // free host memory
free(piv); // free host memory
magma_free(d_a); // free device memory
magma_free(d_r); // free device memory
magma_free(d_c); // free device memory
magma_queue_destroy(queue); // destroy queue
magma_finalize (); // finalize Magma
return 0;
}
在执行开始时,两个 GPU 似乎都在运行,但我很快得到以下错误:
$ ./main_magma_double_example.exe
here1
here2
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:183
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:183
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:192
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:193
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:192
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:193
here3
CUDA runtime error: an illegal memory access was encountered (700) in magma_sync_wtime at /home/fab/magma-2.6.1/control/magma_timer.cpp:98
magma_dgetrf_gpu + magma_dgetri_gpu time: 1.07300 sec.
CUBLAS error: memory mapping error (11) in main at example_double_MAGMA_NVIDIA.cpp:57
upper left corner of a^-1*a:
[
0.1206 0.4128 0.9920 0.4738
0.6438 0.1080 0.1855 0.9998
0.0623 0.0777 0.2275 0.1513
0.4903 0.1876 0.8492 0.3984
];
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:62
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:63
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:64
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:65
可以看到求逆的结果是错误的。我不知道这些错误消息从何而来。
编译由 Makefile.inc 完成:
# Compiler driver: nvcc with -O3 optimization.
CXX = nvcc -O3
# Directory containing the Intel oneAPI MKL libraries (passed to -L in LDFLAGS).
LAPACK = /opt/intel/oneapi/mkl/latest/lib/intel64
# MAGMA installation prefix; include/ and lib/ below it are used.
MAGMA = /usr/local/magma
# CUDA header and library directories.
INCLUDE_CUDA=/usr/local/cuda/include
LIBCUDA=/usr/local/cuda/lib64
# Compile-only flags (-c) with the MAGMA and CUDA include paths.
# NOTE(review): -lpthread is a linker option and presumably has no effect in a -c step - confirm.
CXXFLAGS = -c -I${MAGMA}/include -I${INCLUDE_CUDA} -lpthread
# Link flags: MKL LP64 interface library, CUDA driver/runtime, cuBLAS, MAGMA and pthread.
# NOTE(review): only mkl_intel_lp64 is named here; MKL link lines usually also list a
# threading layer and mkl_core - confirm they are pulled in some other way.
LDFLAGS = -L${LAPACK} -lmkl_intel_lp64 -L${LIBCUDA} -lcuda -lcudart -lcublas -L${MAGMA}/lib -lmagma -lpthread
# Source file to build and name of the resulting executable.
SOURCES = example_double_MAGMA_NVIDIA.cpp
EXECUTABLE = main_magma_double_example.exe
我可以尝试同时利用两个 GPU 卡吗?
PS:我必须提到在 2 个 GPU 卡之间有一个 NVLink 硬件组件。
更新
参考文档(Magma script doc)中使用 magma_dgetrf_mgpu 的示例,
我尝试将其改写为计算矩阵 a 的逆
:
#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
#define min(a,b) (((a)<(b))?(a):(b))
int main( int argc , char** argv)
{
magma_init (); // initialize Magma
int num_gpus = 2;
magma_setdevice (0);
magma_queue_t queues[num_gpus ];
for( int dev = 0; dev < num_gpus; ++dev ) {
magma_queue_create( dev , &queues[dev] );
}
magma_int_t err;
real_Double_t cpu_time ,gpu_time;
magma_int_t m = 8192, n = 8192; // a,r - mxn matrices
magma_int_t mm = m*n;
magma_int_t nrhs =100; // b - nxnrhs , c - mxnrhs matrices
magma_int_t *ipiv; // array of indices of interchanged rows
magma_int_t n2=m*n; // size of a,r
magma_int_t nnrhs=n*nrhs; // size of b
magma_int_t mnrhs=m*nrhs; // size of c
double *a, *r; // a,r - mxn matrices on the host
double *b, *c;// b - nxnrhs , c - mxnrhs matrices on the host
double *dwork; // dwork - workspace
magmaDouble_ptr d_la[num_gpus ];
double alpha =1.0, beta =0.0; // alpha=1,beta=0
magma_int_t ldwork; // size of dwork
ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
//4.3 LU decomposition and solving general linear systems 282
magma_int_t n_local;
magma_int_t ione = 1, info;
magma_int_t i, min_mn=min(m,n), nb;
magma_int_t ldn_local;// mxldn_local - size of the part of a
magma_int_t ISEED [4] = {0,0,0,1}; // on i-th device
nb =magma_get_dgetrf_nb(m,n); // optim.block size for dgetrf
// allocate memory on cpu
ipiv=( magma_int_t *) malloc(min_mn*sizeof(magma_int_t ));
// host memory for ipiv
err = magma_dmalloc_cpu (&a,n2); // host memory for a
err = magma_dmalloc_pinned (&r,n2); // host memory for r
err = magma_dmalloc_pinned (&b,nnrhs); // host memory for b
err = magma_dmalloc_pinned (&c,mnrhs); // host memory for c
// allocate device memory on num_gpus devices
for(i=0; i<num_gpus; i++){
n_local = ((n/nb)/ num_gpus )*nb;
if (i < (n/nb)% num_gpus)
n_local += nb;
else if (i == (n/nb)% num_gpus)
n_local += n%nb;
ldn_local = (( n_local +31)/32)*32;
magma_setdevice(i);
err = magma_dmalloc (&d_la[i],m*ldn_local ); // device memory
} // on i-th device
magma_setdevice (0);
lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
// copy the corresponding parts of the matrix r to num_gpus
magma_dsetmatrix_1D_col_bcyclic( num_gpus , m, n, nb , a, m, d_la , m, queues );
// MAGMA
// LU decomposition on num_gpus devices with partial pivoting
// and row interchanges , row i is interchanged with row ipiv(i)
gpu_time = magma_sync_wtime(NULL);
magma_dgetrf_mgpu( num_gpus, m, n, d_la, m, ipiv, &info);
magma_dgetri_gpu(m, a, m, ipiv, dwork, ldwork, &info);
gpu_time = magma_sync_wtime(NULL)-gpu_time;
printf("magma_dgetrf_mgpu time: %7.5f sec.\n",gpu_time );
// print part of the solution from dgetrf_mgpu and dgetrs
printf("upper left corner of a^-1*a:\n");
magma_dprint( 4, 4, a, m); // magma_dgetrf_mgpu + dgetrs
free(ipiv); // free host memory
free(a); // free host memory
magma_free_pinned(r); // free host memory
magma_free_pinned(b); // free host memory
magma_free_pinned(c); // free host memory
for(i=0; i<num_gpus; i++){
magma_free(d_la[i] ); // free device memory
}
for( int dev = 0; dev < num_gpus; ++dev ) {
magma_queue_destroy( queues[dev] );
}
magma_finalize ();
}
通常,编译正常,但在执行时会出现以下错误:
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:173
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:163
...
我认为自己已经正确初始化了变量 d_la
,但代码中似乎仍然存在错误。
接下来我可以尝试什么?