3

我有一个函数的第一个版本,它反转一个大小矩阵并m使用 如下:magma_dgetrf_gpumagma_dgetri_gpu

  // Inversion
  magma_dgetrf_gpu( m, m, d_a, m, piv, &info);
  magma_dgetri_gpu( m, d_a, m, piv, dwork, ldwork, &info);

现在,我也想使用 Cholesky 分解进行逆运算。该功能看起来像第一个版本,除了使用的功能是:

  // Inversion
  magma_dpotrf_gpu( MagmaLower, m, d_a, m, &info);
  magma_dpotri_gpu( MagmaLower, m, d_a, m, &info);

这是反转的整个函数:

// ROUTINE MAGMA TO INVERSE WITH CHOLESKY
void matrix_inverse_magma(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {

  // Index for loop and arrays
  int i, j, ip, idx;

  // Start magma part
  magma_int_t m = F_matrix.size();
  if (m) {
  magma_init (); // initialize Magma
  magma_queue_t queue=NULL;
  magma_int_t dev=0;
  magma_queue_create(dev ,&queue );
  double gpu_time , *dwork; // dwork - workspace
  magma_int_t ldwork; // size of dwork
  magma_int_t *piv, info; // piv - array of indices of inter -
  // changed rows; a - mxm matrix
  magma_int_t mm=m*m; // size of a, r, c
  double *a; // a- mxm matrix on the host
  double *d_a; // d_a - mxm matrix a on the device
 
  magma_int_t err;
  ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
  // allocate matrices
  err = magma_dmalloc_cpu( &a , mm ); // host memory for a

  // Convert matrix to *a double pointer
  for (i = 0; i<m; i++){
    for (j = 0; j<m; j++){
      idx = i*m + j;
      a[idx] = F_matrix[i][j];
    }
  }
  err = magma_dmalloc( &d_a , mm ); // device memory for a
  err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
  piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.

  magma_dsetmatrix( m, m, a, m, d_a, m, queue); // copy a -> d_a

  // Inversion
  magma_dpotrf_gpu( MagmaLower, m, d_a, m, &info);
  magma_dpotri_gpu( MagmaLower, m, d_a, m, &info);

  magma_dgetmatrix( m, m, d_a , m, a, m, queue); // copy d_a ->a

  // Save Final matrix
  for (i = 0; i<m; i++){
    for (j = 0; j<m; j++){
      idx = i*m + j;
      F_output[i][j] = a[idx];
    }
  }

  free(a); // free host memory
  free(piv); // free host memory
  magma_free(d_a); // free device memory
  magma_queue_destroy(queue); // destroy queue
  magma_finalize (); 
  // End magma part
  }
}

不幸的是,在检查了输出数据之后,我的实现出现了错误的反转。

我对这一行的使用有疑问:

  ldwork = m * magma_get_dgetri_nb( m ); // optimal block size

谁能一眼看出我使用dpotrfanddpotri函数(实际上是magma_dpotrf_gpuand magma_dpotri_gpu)的错误来自哪里?

编辑1:

根据 Damir Tenishev 的建议,我举了一个使用 LAPACKE 对矩阵求逆的函数示例:

// LAPACK version
void matrix_inverse_lapack(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {

  // Index for loop and arrays
  int i, j, ip, idx;

  // Size of F_matrix
  int N = F_matrix.size();

  int *IPIV = new int[N];

  // Statement of main array to inverse
  double *arr = new double[N*N];

  // Output Diagonal block
  double *diag = new double[N];

  for (i = 0; i<N; i++){
    for (j = 0; j<N; j++){
      idx = i*N + j;
      arr[idx] = F_matrix[i][j];
    }
  }

  // LAPACKE routines
  int info1 = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, arr, N, IPIV);
  int info2 = LAPACKE_dgetri(LAPACK_ROW_MAJOR, N, arr, N, IPIV);

  for (i = 0; i<N; i++){
    for (j = 0; j<N; j++){
      idx = i*N + j;
      F_output[i][j] = arr[idx];
    }
  }

  delete[] IPIV;
  delete[] arr;
}

如您所见,这是矩阵求逆的经典版本,它使用LAPACKE_dgetrfLAPACKE_dgetri

编辑2:版本MAGMA是:

// MAGMA version
void matrix_inverse_magma(vector<vector<double>> const &F_matrix, vector<vector<double>> &F_output) {

  // Index for loop and arrays
  int i, j, ip, idx;

  // Start magma part
  magma_int_t m = F_matrix.size();

  if (m) {
  magma_init (); // initialize Magma
  magma_queue_t queue=NULL;
  magma_int_t dev=0;
  magma_queue_create(dev ,&queue );
  double gpu_time , *dwork; // dwork - workspace
  magma_int_t ldwork; // size of dwork
  magma_int_t *piv, info; // piv - array of indices of inter -

  // changed rows; a - mxm matrix
  magma_int_t mm=m*m; // size of a, r, c
  double *a; // a- mxm matrix on the host
  double *d_a; // d_a - mxm matrix a on the device
 
  magma_int_t ione = 1;
  magma_int_t ISEED [4] = { 0,0,0,1 }; // seed
  magma_int_t err;
  const double alpha = 1.0; // alpha =1
  const double beta = 0.0; // beta=0
  ldwork = m * magma_get_dgetri_nb( m ); // optimal block size
  // allocate matrices
  err = magma_dmalloc_cpu( &a , mm ); // host memory for a

  for (i = 0; i<m; i++){
    for (j = 0; j<m; j++){
      idx = i*m + j;
      a[idx] = F_matrix[i][j]
    }
  }
  err = magma_dmalloc( &d_a , mm ); // device memory for a
  err = magma_dmalloc( &dwork , ldwork );// dev. mem. for ldwork
  piv=( magma_int_t *) malloc(m*sizeof(magma_int_t ));// host mem.

  magma_dsetmatrix( m, m, a, m, d_a, m, queue); // copy a -> d_a

  // find the inverse matrix: d_a*X=I using the LU factorization
  // with partial pivoting and row interchanges computed by
  // magma_dgetrf_gpu; row i is interchanged with row piv(i);
  // d_a -mxm matrix; d_a is overwritten by the inverse

  magma_dgetrf_gpu( m, m, d_a, m, piv, &info);
  magma_dgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info);

  magma_dgetmatrix( m, m, d_a , m, a, m, queue); // copy d_a ->a

  for (i = 0; i<m; i++){
    for (j = 0; j<m; j++){
      idx = i*m + j;
      F_output[i][j] = a[idx];
    }
  }

  free(a); // free host memory
  free(piv); // free host memory
  magma_free(d_a); // free device memory
  magma_queue_destroy(queue); // destroy queue
  magma_finalize (); 
  // End magma part
  }
}

如您所见,我使用了magma_dgetrf_gpumagma_dgetri_gpu函数。

现在,我想用LAPACKEor MAGMA+LAPACK、使用dpotrfanddpotri函数来做同样的事情。我记得我逆的矩阵是对称的。

编辑 3:我的尝试来自此文档链接

特别是,请参见magma dpotri - invert a positive definite matrix in double precision, CPU interface第 325 页的第 4.4.21 节。

4

0 回答 0