0

我正在尝试编写一个程序来将 cusolverSp 连接到 fortran。尽管我对用 C 编写 cuda 并不陌生,但我不确定如何在 fortran 上使用它。

以下是我的代码:

!  Fortran Console Application 
!
  module cuda_cusolverSP

  interface

 ! cudaMalloc
 integer (c_int) function cudaMalloc ( buffer, size ) bind (C, name="cudaMalloc" ) 
   use iso_c_binding
   implicit none
   type (c_ptr)  :: buffer
   integer (c_size_t), value :: size
 end function cudaMalloc

 ! cudaMemcpy 
 integer (c_int) function cudaMemcpy ( dst, src, count, kind ) bind (C, name="cudaMemcpy" )
   ! note: cudaMemcpyHostToDevice = 1
   ! note: cudaMemcpyDeviceToHost = 2
   use iso_c_binding
   type (C_PTR), value :: dst, src
   integer (c_size_t), value :: count, kind
 end function cudaMemcpy

 ! cudaFree
 integer (c_int) function cudaFree(buffer)  bind(C, name="cudaFree")
   use iso_c_binding
   implicit none
   type (C_PTR), value :: buffer
 end function cudaFree

 integer (c_int) function cudaMemGetInfo(fre, tot) bind(C, name="cudaMemGetInfo")
   use iso_c_binding
   implicit none
   type(c_ptr),value :: fre
   type(c_ptr),value :: tot
 end function cudaMemGetInfo


 integer(c_int) function cusolverSpCreate(cusolver_Hndl) bind(C,name="cusolverSpCreate")

 use iso_c_binding
 implicit none

 type(c_ptr)::cusolver_Hndl

 end function

 integer(c_int) function cusolverSpDestroy(cusolver_Hndl) bind(C,name="cusolverSpDestroy")

 use iso_c_binding
 implicit none

 type(c_ptr),value::cusolver_Hndl

 end function

integer(c_int) function cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,Lwork) bind(C,name="cusolverSpSgetrf_bufferSize") 

  use iso_c_binding
  implicit none

  type(c_ptr),value::cusolver_Hndl
  integer(c_int),value::m
  integer(c_int),value::n
  type(c_ptr),value::d_A
  integer(c_int),value::lda
  type(c_ptr),value::Lwork
end function

 integer(c_int) function cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo) bind(C, name="cusolverSpSgetrf")

   use iso_c_binding
   implicit none
   type(c_ptr),value::cusolver_Hndl
   integer(c_int),value::m
   integer(c_int),value::n
   type(c_ptr),value::d_A
   integer(c_int),value::lda
   type(c_ptr),value::d_WS
   type(c_ptr),value::d_Ipiv
   type(c_ptr),value::d_devInfo

 end function 

 integer (c_int) function cusolverSpSgetrs(cusolver_Hndl,trans,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo) bind(C, name="cusolverSpSgetrs")

   use iso_c_binding
   implicit none
   type(c_ptr),value::cusolver_Hndl
   integer(c_int), value::trans
   integer(c_int), value::n
   integer(c_int), value::nrhs
   type(c_ptr),value::d_A
   integer(c_int), value::lda    
   type(c_ptr),value::d_Ipiv
   type(c_ptr),value::d_B
   integer(c_int),value::ldb
   type(c_ptr),value::d_devInfo

      end function

   end interface  

  end module 

program prog

use iso_c_binding
use cuda_cusolverSP

! ------ Matrix Definition & host CPU storage variables 

integer(c_int) rowsA  ! number of rows of A
integer(c_int) colsA  ! number of columns of A
integer(c_int) nnzA   ! number of nonzeros of A
integer(c_int) baseA  ! base index in CSR format

! CSR(A) from I/O <--- pointers to host CPU memory
type(c_ptr) :: h_csrRowPtrA
type(c_ptr) :: h_csrColIndA(:)
type(c_ptr) :: h_csrValA(:) 

type(c_ptr) :: h_x  ! x = A \ b
type(c_ptr) :: h_b  ! b = ones(m,1)
type(c_ptr) :: h_r  ! r = b - A*x

type(c_ptr) :: h_Q  ! <int> n
                    ! reorder to reduce zero fill-in
                    ! Q = symrcm(A) or Q = symamd(A)
!   B = Q*A*Q^T
type(c_ptr) :: h_csrRowPtrB ! <int> n+1
type(c_ptr) :: h_csrColIndB ! <int> nnzA
type(c_ptr) :: h_csrValB    ! <double> nnzA
type(c_ptr) :: h_mapBfromA  ! <int> nnzA


integer size_perm
type(c_ptr) :: buffer_cpu   ! working space for permutation: B = Q*A*Q^T


! -------------------- pointers to device memory    
type(c_ptr) :: d_csrRowPtrA
type(c_ptr) :: d_csrColIndA
type(c_ptr) :: d_csrValA 
type(c_ptr) :: d_x    ! x = A \ b
type(c_ptr) :: d_b    ! a copy of h_b
type(c_ptr) :: d_r    ! r = b - A*x

doubleprecision tol
integer reorder
integer singularity 

type(c_ptr)::cpfre,cptot
integer*8,target::free,total
integer res
integer*8 cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice
integer*4 CUBLAS_OP_N, CUBLAS_OP_T
parameter (cudaMemcpyHostToDevice=1)
parameter (cudaMemcpyDeviceToHost=2)
parameter (CUBLAS_OP_N=0)
parameter (CUBLAS_OP_T=1)   


! ==================================================================
rowsA = 0
colsA = 0
nnzA = 0
baseA = 0    



A_size = SIZEOF(rowsA)
B_size = SIZEOF(B)
X_size = SIZEOF(X)

size_perm = 0
tol = 1.e-12
reorder = 0 ! no reordering
singularity = 0 ! -1 if A is invertible under tol.

! Step 1: Create cudense handle ---------------
cusolver_stat = cusolverSpCreate(cusolver_Hndl)  
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cusolverSpCreate error: ", cusolver_stat
    write (*,*)
    stop
end if

! Step 2: copy A and B to Device

A_mem_stat    = cudaMalloc(d_A,A_size)
if (A_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMalloc 1 error: ", A_mem_stat
    write (*,*)
    stop
end if  

B_mem_stat    = cudaMalloc(d_B,B_size)  
if (B_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMalloc 2 error: ", B_mem_stat
    write (*,*)
    stop
end if    


! ---------- copy A and B to Device

A_mem_stat = cudaMemcpy(d_A,CPU_A_ptr,A_size,cudaMemcpyHostToDevice)
if (A_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMemcpy 1 error: ", A_mem_stat
    write (*,*)
!       stop
   end if

B_mem_stat = cudaMemcpy(d_B,CPU_B_ptr,B_size,cudaMemcpyHostToDevice)
if (B_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMemcpy 2 error: ", B_mem_stat
    write (*,*)
!       stop
    end if

! Step 3: query working space of Sgetrf (and allocate memory on device)

Lwork = 5
cusolver_stat =  cusolverSpSgetrf_bufferSize(cusolver_Hndl,m,n,d_A,lda,CPU_Lwork_ptr) 
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " SpSgetrf_bufferSize error: ", cusolver_stat
    write (*,*)
!      stop
    end if

write (*,*)
write (*, '(A, I12)') " Lwork: ", Lwork
write (*,*)


Workspace = 4*Lwork
WS_mem_stat = cudaMalloc(d_WS,Workspace)
if (WS_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMalloc 6 error: ", WS_mem_stat
    write (*,*)
!      stop
    end if

! Step 4: compute LU factorization of [A] 

cusolver_stat = cusolverSpSgetrf(cusolver_Hndl,m,n,d_A,lda,d_WS,d_Ipiv,d_devInfo) 
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cusolverSpSgetrf error: ", WS_mem_stat
    write (*,*)
!      stop
    end if

! Step 5: compute solution vector [X] for Right hand side [B]

cusolver_stat = cusolverSpSgetrs(cusolver_Hndl,CUBLAS_OP_N,n,nrhs,d_A,lda,d_Ipiv,d_B,ldb,d_devInfo)  
if (cusolver_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cusolverSpSgetrs error: ", WS_mem_stat
    write (*,*)
!      stop
    end if

! Step 6: copy solution vector stored in [B] on device into [X] vector on host

X_mem_stat = cudaMemcpy(CPU_X_ptr,d_B,B_size,cudaMemcpyDeviceToHost)  
if (X_mem_stat .ne. 0 ) then
    write (*,*)
    write (*, '(A, I2)') " cudaMemcpy 4 error: ", WS_mem_stat
    write (*,*)
!      stop
    end if

!    do i = 1, n
!        print *, x(i,1)
!    enddo

! step 7: free memory on device and release CPU-side resources

A_mem_Stat    = cudafree(d_A)
B_mem_Stat    = cudafree(d_B)
Ipiv_mem_stat = cudafree(d_Ipiv)
WS_mem_stat   = cudafree(d_WS)
Lwork_mem_stat = cudafree(d_Lwork)

cusolver_stat = cusolverSpDestroy(cusolver_Hndl)

! Step 8: deallocate memory on host before exit

!    deallocate(A)
!   deallocate(ATest)
!    deallocate(B)
!    deallocate(X)
!    deallocate(Ipiv)



end program prog

我构建期间的当前错误是

错误 S0188:参数号 # 到 cusolverspcreate/etc:类型不匹配

我不知道如何解决它。该程序是对工作 cusolverDn 的修改,我确信这意味着我犯了很多错误,因为我可以参考的接口示例并不多。

4

1 回答 1

1

implicit none的主程序中没有并且cusolver_Hndl未声明,因此假定为real.

使用implicit none并声明所有变量。cusolver_Hndl应该type(ptr)并且不要忘记设置它的值(如果它不是输出参数,则接口不会显示任何意图)。

于 2017-02-09T09:03:50.727 回答