1

我有一个来自 MAGMA 文档的简单程序,它可以在 GPU 卡上对大型矩阵求逆。

#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"

int main() {

  //double *d_lA[MagmaMaxGPUs];
  magma_init (); // initialize Magma
  magma_queue_t queue=NULL;
  magma_int_t dev=0;
  magma_queue_create(dev ,&queue );
  double gpu_time , *dwork; // dwork - workspace
  magma_int_t ldwork; // size of dwork
  magma_int_t *piv , info; // piv - array of indices of inter -
  magma_int_t m = 20000; // changed rows; a - mxm matrix
  magma_int_t mm=m*m; // size of a, r, c
  double *a; // a- mxm matrix on the host
  double *d_a; // d_a - mxm matrix a on the device
  double *d_r; // d_r - mxm matrix r on the device
  double *d_c; // d_c - mxm matrix c on the device
  magma_int_t ione = 1;
  magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
  magma_int_t err;
  const double alpha = 1.0; // alpha =1
  const double beta = 0.0; // beta=0
  ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
  // allocate matrices
  err = magma_dmalloc_cpu( &a , mm ); // host memory for a
  err = magma_dmalloc( &d_a , mm ); // device memory for a
  err = magma_dmalloc( &d_r , mm ); // device memory for r
  err = magma_dmalloc( &d_c , mm ); // device memory for c
  err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
  piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
  // generate random matrix a // for piv
  lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
  printf("here1\n");
  magma_dsetmatrix( m, m, a, m, d_a , m, queue); // copy a -> d_a
  magmablas_dlacpy(MagmaFull , m, m, d_a , m, d_r ,m,queue);//d_a ->d_r
  // find the inverse matrix: d_a*X=I using the LU factorization
  // with partial pivoting and row interchanges computed by
  // magma_dgetrf_gpu; row i is interchanged with row piv(i);
  // d_a -mxm matrix; d_a is overwritten by the inverse
  gpu_time = magma_sync_wtime(NULL);

  printf("here2\n");
  //magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
  magma_dgetrf_gpu(m, m, d_a, m, piv, &info);
  magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
  printf("here3\n");

  gpu_time = magma_sync_wtime(NULL)-gpu_time;
  magma_dgemm(MagmaNoTrans ,MagmaNoTrans ,m,m,m,alpha ,d_a ,m,
  d_r ,m,beta ,d_c ,m,queue); // multiply a^-1*a
  printf("magma_dgetrf_gpu + magma_dgetri_gpu time: %7.5f sec.\
  \n",gpu_time );
  magma_dgetmatrix( m, m, d_c , m, a, m, queue); // copy d_c ->a
  printf("upper left corner of a^-1*a:\n");
  magma_dprint( 4, 4, a, m ); // part of a^-1*a
  free(a); // free host memory
  free(piv); // free host memory
  magma_free(d_a); // free device memory
  magma_free(d_r); // free device memory
  magma_free(d_c); // free device memory
  magma_queue_destroy(queue); // destroy queue
  magma_finalize (); // finalize Magma
  return 0;

}

一切正常,运行时间也如预期的那样很短:

单卡上的良好执行

现在,如果您查看上面的代码,您会看到我注释掉了 2 行:

  //double *d_lA[MagmaMaxGPUs];

  //magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);

实际上,取消这 2 行的注释,并把 magma_dgetrf_gpu(m, m, d_a, m, piv, &info); 替换为 magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);,是为了让这段代码分布到 2 块 GPU 卡(2 块 RTX A6000)上运行。

我已经按照 MAGMA 文档的说明设置了 MAGMA_NUM_GPUS=2。

所以我有代码:

#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"

int main() {

  double *d_lA[MagmaMaxGPUs];
  magma_init (); // initialize Magma
  magma_queue_t queue=NULL;
  magma_int_t dev=0;
  magma_queue_create(dev ,&queue );
  double gpu_time , *dwork; // dwork - workspace
  magma_int_t ldwork; // size of dwork
  magma_int_t *piv , info; // piv - array of indices of inter -
  magma_int_t m = 20000; // changed rows; a - mxm matrix
  magma_int_t mm=m*m; // size of a, r, c
  double *a; // a- mxm matrix on the host
  double *d_a; // d_a - mxm matrix a on the device
  double *d_r; // d_r - mxm matrix r on the device
  double *d_c; // d_c - mxm matrix c on the device
  magma_int_t ione = 1;
  magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
  magma_int_t err;
  const double alpha = 1.0; // alpha =1
  const double beta = 0.0; // beta=0
  ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
  // allocate matrices
  err = magma_dmalloc_cpu( &a , mm ); // host memory for a
  err = magma_dmalloc( &d_a , mm ); // device memory for a
  err = magma_dmalloc( &d_r , mm ); // device memory for r
  err = magma_dmalloc( &d_c , mm ); // device memory for c
  err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
  piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.
  // generate random matrix a // for piv
  lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
  printf("here1\n");
  magma_dsetmatrix( m, m, a, m, d_a , m, queue); // copy a -> d_a
  magmablas_dlacpy(MagmaFull , m, m, d_a , m, d_r ,m,queue);//d_a ->d_r
  // find the inverse matrix: d_a*X=I using the LU factorization
  // with partial pivoting and row interchanges computed by
  // magma_dgetrf_gpu; row i is interchanged with row piv(i);
  // d_a -mxm matrix; d_a is overwritten by the inverse
  gpu_time = magma_sync_wtime(NULL);

  printf("here2\n");
  magma_dgetrf_mgpu(2, m, m, d_lA, m, piv, &info);
  magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);
  printf("here3\n");

  gpu_time = magma_sync_wtime(NULL)-gpu_time;
  magma_dgemm(MagmaNoTrans ,MagmaNoTrans ,m,m,m,alpha ,d_a ,m,
  d_r ,m,beta ,d_c ,m,queue); // multiply a^-1*a
  printf("magma_dgetrf_gpu + magma_dgetri_gpu time: %7.5f sec.\
  \n",gpu_time );
  magma_dgetmatrix( m, m, d_c , m, a, m, queue); // copy d_c ->a
  printf("upper left corner of a^-1*a:\n");
  magma_dprint( 4, 4, a, m ); // part of a^-1*a
  free(a); // free host memory
  free(piv); // free host memory
  magma_free(d_a); // free device memory
  magma_free(d_r); // free device memory
  magma_free(d_c); // free device memory
  magma_queue_destroy(queue); // destroy queue
  magma_finalize (); // finalize Magma
  return 0;

}

在执行开始时,两个 GPU 似乎都在运行,但我很快得到以下错误:

$ ./main_magma_double_example.exe
here1
here2
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:183
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:183
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:192
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:193
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:192
CUDA runtime error: an illegal memory access was encountered (700) in magma_dgetrf_mgpu at /home/fab/magma-2.6.1/src/dgetrf_mgpu.cpp:193
here3
CUDA runtime error: an illegal memory access was encountered (700) in magma_sync_wtime at /home/fab/magma-2.6.1/control/magma_timer.cpp:98
magma_dgetrf_gpu + magma_dgetri_gpu time: 1.07300 sec.
CUBLAS error: memory mapping error (11) in main at example_double_MAGMA_NVIDIA.cpp:57
upper left corner of a^-1*a:
[
   0.1206   0.4128   0.9920   0.4738
   0.6438   0.1080   0.1855   0.9998
   0.0623   0.0777   0.2275   0.1513
   0.4903   0.1876   0.8492   0.3984
];
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:62
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:63
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:64
CUDA runtime error: an illegal memory access was encountered (700) in main at example_double_MAGMA_NVIDIA.cpp:65

我们可以看到求逆的结果是错误的。我不知道这些错误消息从何而来。

编译由 Makefile.inc 完成:

# Build settings for the MAGMA example program.
# Compiler driver: nvcc with -O3 optimisation.
CXX = nvcc -O3
# Library directory of Intel oneAPI MKL; passed to the linker with -L in LDFLAGS.
LAPACK = /opt/intel/oneapi/mkl/latest/lib/intel64
# MAGMA installation prefix; ${MAGMA}/include and ${MAGMA}/lib are used below.
MAGMA = /usr/local/magma
# CUDA toolkit header directory.
INCLUDE_CUDA=/usr/local/cuda/include
# CUDA toolkit library directory.
LIBCUDA=/usr/local/cuda/lib64
# Compile-only flags (-c) with the MAGMA and CUDA include paths.
# NOTE(review): -lpthread is a link option and presumably has no effect together with -c - confirm.
CXXFLAGS = -c -I${MAGMA}/include -I${INCLUDE_CUDA} -lpthread
# Link flags: MKL interface layer, CUDA driver/runtime, cuBLAS, MAGMA and pthreads.
# NOTE(review): only mkl_intel_lp64 is listed; MKL link lines usually also name a
# threading layer and mkl_core - verify whether they are pulled in some other way.
LDFLAGS = -L${LAPACK} -lmkl_intel_lp64 -L${LIBCUDA} -lcuda -lcudart -lcublas -L${MAGMA}/lib -lmagma -lpthread
# Source file to compile.
SOURCES = example_double_MAGMA_NVIDIA.cpp
# Name of the produced executable.
EXECUTABLE = main_magma_double_example.exe

有什么办法可以让我同时利用两块 GPU 卡吗?

PS:需要说明的是,这 2 块 GPU 卡之间通过 NVLink 硬件相连。

更新

参考文档(Magma script doc)中 magma_dgetrf_mgpu 的用法,我尝试对其进行调整,以计算矩阵 a 的逆:

#include <stdio.h>
#include <cuda.h>
#include "magma_v2.h"
#include "magma_lapack.h"
#define min(a,b) (((a)<(b))?(a):(b))

int main( int argc , char** argv)
{
  magma_init (); // initialize Magma
  int num_gpus = 2;
  magma_setdevice (0);
  magma_queue_t queues[num_gpus ];
  for( int dev = 0; dev < num_gpus; ++dev ) {
  magma_queue_create( dev , &queues[dev] );
  }
  magma_int_t err;
  real_Double_t cpu_time ,gpu_time;
  magma_int_t m = 8192, n = 8192; // a,r - mxn matrices
  magma_int_t mm = m*n;
  magma_int_t nrhs =100; // b - nxnrhs , c - mxnrhs matrices
  magma_int_t *ipiv; // array of indices of interchanged rows
  magma_int_t n2=m*n; // size of a,r
  magma_int_t nnrhs=n*nrhs; // size of b
  magma_int_t mnrhs=m*nrhs; // size of c
  double *a, *r; // a,r - mxn matrices on the host
  double *b, *c;// b - nxnrhs , c - mxnrhs matrices on the host
  double *dwork; // dwork - workspace
  magmaDouble_ptr d_la[num_gpus ];
  double alpha =1.0, beta =0.0; // alpha=1,beta=0
  magma_int_t ldwork; // size of dwork
  ldwork = m * magma_get_dgetri_nb( m ); // optimal block size

  //4.3 LU decomposition and solving general linear systems 282
  magma_int_t n_local;
  magma_int_t ione = 1, info;
  magma_int_t i, min_mn=min(m,n), nb;
  magma_int_t ldn_local;// mxldn_local - size of the part of a
  magma_int_t ISEED [4] = {0,0,0,1}; // on i-th device
  nb =magma_get_dgetrf_nb(m,n); // optim.block size for dgetrf
  // allocate memory on cpu
  ipiv=( magma_int_t *) malloc(min_mn*sizeof(magma_int_t ));

  // host memory for ipiv
  err = magma_dmalloc_cpu (&a,n2); // host memory for a
  err = magma_dmalloc_pinned (&r,n2); // host memory for r
  err = magma_dmalloc_pinned (&b,nnrhs); // host memory for b
  err = magma_dmalloc_pinned (&c,mnrhs); // host memory for c

  // allocate device memory on num_gpus devices
  for(i=0; i<num_gpus; i++){
  n_local = ((n/nb)/ num_gpus )*nb;
  if (i < (n/nb)% num_gpus)
  n_local += nb;
  else if (i == (n/nb)% num_gpus)
  n_local += n%nb;
  ldn_local = (( n_local +31)/32)*32;
  magma_setdevice(i);
  err = magma_dmalloc (&d_la[i],m*ldn_local ); // device memory
  } // on i-th device
  magma_setdevice (0);
  
  lapackf77_dlarnv (&ione ,ISEED ,&mm ,a); // randomize a
 
  // copy the corresponding parts of the matrix r to num_gpus
  magma_dsetmatrix_1D_col_bcyclic( num_gpus , m, n, nb , a, m, d_la , m, queues );

  // MAGMA
  // LU decomposition on num_gpus devices with partial pivoting
  // and row interchanges , row i is interchanged with row ipiv(i)
  gpu_time = magma_sync_wtime(NULL);
  magma_dgetrf_mgpu( num_gpus, m, n, d_la, m, ipiv, &info);
  magma_dgetri_gpu(m, a, m, ipiv, dwork, ldwork, &info);
  gpu_time = magma_sync_wtime(NULL)-gpu_time;
  printf("magma_dgetrf_mgpu time: %7.5f sec.\n",gpu_time );

  // print part of the solution from dgetrf_mgpu and dgetrs
  printf("upper left corner of a^-1*a:\n");
  magma_dprint( 4, 4, a, m); // magma_dgetrf_mgpu + dgetrs
  free(ipiv); // free host memory
  free(a); // free host memory
  magma_free_pinned(r); // free host memory
  magma_free_pinned(b); // free host memory
  magma_free_pinned(c); // free host memory
  for(i=0; i<num_gpus; i++){
  magma_free(d_la[i] ); // free device memory
  }
  for( int dev = 0; dev < num_gpus; ++dev ) {
  magma_queue_destroy( queues[dev] );
  }
  magma_finalize ();
}

编译可以正常通过,但在执行时会出现以下错误:

CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:172
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:173
CUBLAS error: memory mapping error (11) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:162
CUDA runtime error: an illegal memory access was encountered (700) in magma_dtrtri_gpu at /home/henry/magma-2.6.1/src/dtrtri_gpu.cpp:163
...

我认为自己已经正确地初始化了变量 d_la,但代码中似乎仍然存在错误。

接下来我可以尝试什么?

4

0 回答 0