I used Jonathan's code and tried to solve A*X = B, but my results change depending on the number of processors. Can anyone help me with this?
! Use MPI-IO to read a matrix distributed block-cyclically,
! use ScaLAPACK to solve A*X = B.
!
Program array
USE mpi
IMPLICIT NONE
INTEGER, PARAMETER :: mk=SELECTED_REAL_KIND(15,307)
INTEGER :: N,Nb
! problem size and block size
INTEGER :: myArows, myAcols, myBcols
! size of local subset of global array
REAL(mk), DIMENSION(:), ALLOCATABLE :: myA,myB
INTEGER, DIMENSION(:), ALLOCATABLE :: ipiv
INTEGER, EXTERNAL :: numroc
! blacs routine
INTEGER :: me,nprocs,ictxt,prow,pcol,myrow,mycol
! blacs data
INTEGER :: info
! scalapack return value
INTEGER, DIMENSION(9) :: desca,descb
! scalapack array desc
INTEGER :: clock
REAL(mk) :: calctime,iotime
CHARACTER(LEN=128) :: matsize,fnamea,fnameb
INTEGER :: myrank
INTEGER :: ierr
INTEGER, DIMENSION(2) :: pdims,dims,distribs,dargs
INTEGER :: infile
INTEGER, DIMENSION(MPI_STATUS_SIZE) :: mpistatus
INTEGER :: darraya,darrayb
!New data type (handle)
INTEGER :: locsizea, nelementsa
INTEGER :: locsizeb, nelementsb,nelementsw
INTEGER(KIND=MPI_ADDRESS_KIND) :: lba,locextenta
INTEGER(KIND=MPI_ADDRESS_KIND) :: lbb,locextentb
!INTEGER(KIND=MPI_OFFSET_KIND) :: disp
INTEGER :: nargs
INTEGER :: m,IPACPY,ipa
INTEGER :: mpik
!MPI real kind precision
CALL MPI_Init(ierr)
CALL MPI_Comm_size(MPI_COMM_WORLD,nprocs,ierr)
CALL MPI_Comm_rank(MPI_COMM_WORLD,myrank,ierr)
! Initialize MPI (for MPI-IO)
pdims = 0
CALL MPI_Dims_create(nprocs, 2, pdims, ierr)
prow = pdims(1)
pcol = pdims(2)
!Get the process grid from MPI_Dims_create
nargs = command_argument_count()
IF (nargs /= 3) THEN
   PRINT *,'Usage: mpirun -np np ./a.out N filenamea filenameb'
   PRINT *,' Where np is the number of processors'
   PRINT *,' Where N is the size of Matrix A'
   PRINT *,' Where filenamea = name of the A matrix data file.'
   PRINT *,' Where filenameb = name of the B matrix data file.'
   CALL MPI_Abort(MPI_COMM_WORLD,1,ierr)
ENDIF
CALL get_command_argument(1, matsize)
CALL get_command_argument(2, fnamea)
CALL get_command_argument(3, fnameb)
! get filename
READ(matsize(1:LEN_TRIM(matsize)),*) N
!The size of the array we'll be using
SELECT CASE (mk)
CASE (SELECTED_REAL_KIND(6,37))
   mpik = MPI_REAL
CASE (SELECTED_REAL_KIND(15,307))
   mpik = MPI_DOUBLE_PRECISION
CASE DEFAULT
   PRINT*,"The input data type is not defined!"
   CALL MPI_Abort(MPI_COMM_WORLD,1,ierr)
END SELECT
Nb = 64
!Block size (for a big matrix, 32 or 64 is a suitable choice)
IF (Nb > (N/prow)) Nb = N/prow
IF (Nb > (N/pcol)) Nb = N/pcol
dims = [N,N]
distribs = [MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC]
dargs = [Nb,Nb]
CALL MPI_Type_create_darray(nprocs,myrank,2,dims,distribs,dargs, &
& pdims,MPI_ORDER_FORTRAN,mpik,darraya,ierr)
!Creates a distributed array datatype
CALL MPI_Type_commit(darraya,ierr)
CALL MPI_Type_size(darraya,locsizea,ierr)
nelementsa = locsizea/(STORAGE_SIZE(0.0_mk)/8) ! divide by bytes per element, not by the kind value
CALL MPI_Type_get_extent(darraya,lba,locextenta,ierr)
dims = [N,1]
dargs = [Nb,1]
CALL MPI_Type_create_darray(nprocs,myrank,2,dims,distribs,dargs, &
& pdims,MPI_ORDER_FORTRAN,mpik,darrayb,ierr)
!Creates a distributed array datatype
CALL MPI_Type_commit(darrayb,ierr)
CALL MPI_Type_size(darrayb,locsizeb,ierr)
nelementsb = locsizeb/(STORAGE_SIZE(0.0_mk)/8) ! divide by bytes per element, not by the kind value
CALL MPI_Type_get_extent(darrayb,lbb,locextentb,ierr)
ALLOCATE(myA(nelementsa))
ALLOCATE(myB(nelementsb))
! Initialize local arrays
CALL tick(clock)
! Start the I/O timer (tock(clock) below would otherwise read an unset clock)
CALL MPI_File_open(MPI_COMM_WORLD,TRIM(fnamea),MPI_MODE_RDONLY, &
& MPI_INFO_NULL,infile,ierr)
CALL MPI_File_read_all(infile,myA,nelementsa,mpik,mpistatus,ierr)
CALL MPI_File_close(infile,ierr)
! read in the data
CALL MPI_File_open(MPI_COMM_WORLD,TRIM(fnameb),MPI_MODE_RDONLY, &
& MPI_INFO_NULL,infile,ierr)
CALL MPI_File_read_all(infile,myB,nelementsb,mpik,mpistatus,ierr)
CALL MPI_File_close(infile,ierr)
! read in the data
iotime = tock(clock)
IF (myrank == 0) THEN
   PRINT *,'I/O time = ', iotime
ENDIF
! Initialize blacs processor grid
CALL tick(clock)
CALL blacs_pinfo(me,nprocs)
CALL blacs_get (-1,0,ictxt)
CALL blacs_gridinit(ictxt,'R',prow,pcol)
CALL blacs_gridinfo(ictxt,prow,pcol,myrow,mycol)
myArows = numroc(N,Nb,myrow,0,prow)
myAcols = numroc(N,Nb,mycol,0,pcol)
myBcols = numroc(1,Nb,mycol,0,pcol)
!NUMROC computes the NUMber of Rows Or Columns of a distributed
!matrix owned by the process indicated by the 3rd input.
!the 4th input element is the coordinate of the process that possesses the
!first row or column of the distributed matrix
! Construct local arrays
! Global structure: matrix A of n rows and n columns
CALL descinit(desca,N,N,Nb,Nb,0,0,ictxt,max(1,myArows),info)
CALL descinit(descb,N,1,Nb, 1,0,0,ictxt,max(1,myArows),info)
! Prepare array descriptors for ScaLAPACK
nelementsw = myArows + Nb
! PDGESV/PSGESV need ipiv of size LOCr(M_A) + MB_A = myArows + Nb on every process
ALLOCATE(ipiv(nelementsw))
!CALL THE SCALAPACK ROUTINE
!Solve the linear system A * X = B
SELECT CASE (mk)
CASE (SELECTED_REAL_KIND(6,37))
   CALL PSGESV(N,1,myA,1,1,desca,ipiv,myB,1,1,descb,info)
CASE (SELECTED_REAL_KIND(15,307))
   CALL PDGESV(N,1,myA,1,1,desca,ipiv,myB,1,1,descb,info)
END SELECT
IF (info /= 0) THEN
   PRINT *, myrank,'Error -- info = ', info
   CALL MPI_Abort(MPI_COMM_WORLD,1,ierr)
ENDIF
WRITE(fnameb,'(A)')'sol.dat'
CALL PDLAWRITE(trim(fnameb),N,1,myB,1,1,DESCB,0,0,ipiv)
!CALL MPI_File_open(MPI_COMM_WORLD,TRIM(fnameb),MPI_MODE_CREATE+ &
!& MPI_MODE_WRONLY,MPI_INFO_NULL,infile,ierr)
!CALL MPI_File_write_all(infile,myB,nelementsb,mpik,mpistatus,ierr)
!CALL MPI_File_close(infile,ierr)
! DEALLOCATE the local arrays
DEALLOCATE(myA,ipiv,myB)
! End blacs for processors that are used
CALL blacs_gridexit(ictxt)
calctime = tock(clock)
! PRINT results
IF (me == 0) THEN
   IF (info /= 0) THEN
      PRINT *, 'Error -- info = ', info
   ENDIF
   PRINT *,'Compute time = ', calctime
ENDIF
CALL MPI_Finalize(ierr)
CONTAINS
SUBROUTINE tick(t)
   INTEGER, INTENT(OUT) :: t
   CALL system_clock(t)
END SUBROUTINE tick
! Returns the elapsed time in seconds since the clock value t captured by tick
FUNCTION tock(t)
   INTEGER, INTENT(IN) :: t
   REAL(mk) :: tock
   INTEGER :: now, clock_rate
   CALL system_clock(now, clock_rate)
   tock = REAL(now-t,mk)/REAL(clock_rate,mk)
END FUNCTION tock
END PROGRAM array
Matrices A and B come from the ScaLAPACK example; I just rewrote them in binary format so that the program above can read them (see the conversion sketch after the listings).
19.0 3.0 1.0 12.0 1.0 16.0 1.0 3.0 11.0
-19.0 3.0 1.0 12.0 1.0 16.0 1.0 3.0 11.0
-19.0 -3.0 1.0 12.0 1.0 16.0 1.0 3.0 11.0
-19.0 -3.0 -1.0 12.0 1.0 16.0 1.0 3.0 11.0
-19.0 -3.0 -1.0 -12.0 1.0 16.0 1.0 3.0 11.0
-19.0 -3.0 -1.0 -12.0 -1.0 16.0 1.0 3.0 11.0
-19.0 -3.0 -1.0 -12.0 -1.0 -16.0 1.0 3.0 11.0
-19.0 -3.0 -1.0 -12.0 -1.0 -16.0 -1.0 3.0 11.0
-19.0 -3.0 -1.0 -12.0 -1.0 -16.0 -1.0 -3.0 11.0
and B:
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
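For reference, a minimal sketch of such a conversion, assuming the text matrices sit one row per line in files named A.txt and B.txt (placeholder names) and that the binary files must contain the raw column-major 8-byte reals, with no Fortran record markers, that MPI_File_read_all above expects:
! Hypothetical helper, separate from the solver: converts the text matrices
! listed here into raw binary files (column-major, stream access, so no
! Fortran record markers are inserted).
PROGRAM txt2bin
IMPLICIT NONE
INTEGER, PARAMETER :: mk = SELECTED_REAL_KIND(15,307)
INTEGER, PARAMETER :: N = 9
REAL(mk) :: A(N,N), B(N)
INTEGER :: i
! Read the text files, one matrix row per line
OPEN(10, FILE='A.txt', STATUS='old', ACTION='read')
DO i = 1, N
   READ(10,*) A(i,:)
END DO
CLOSE(10)
OPEN(10, FILE='B.txt', STATUS='old', ACTION='read')
DO i = 1, N
   READ(10,*) B(i)
END DO
CLOSE(10)
! Write the raw column-major bytes
OPEN(11, FILE='A.dat', ACCESS='stream', FORM='unformatted', STATUS='replace')
WRITE(11) A
CLOSE(11)
OPEN(11, FILE='B.dat', ACCESS='stream', FORM='unformatted', STATUS='replace')
WRITE(11) B
CLOSE(11)
END PROGRAM txt2bin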
For example, with one processor:
mpirun -np 1 ./array 9 A.dat B.dat
9 1
0.000000000000000000D+00
-0.166666666666666657D+00
0.500000000000000000D+00
0.000000000000000000D+00
0.000000000000000000D+00
0.000000000000000000D+00
0.000000000000000000D+00
0.000000000000000000D+00
0.000000000000000000D+00
With 2 processors:
9 1
-0.432625129877109577D-01
-0.000000000000000000D+00
-0.677331518039483299D-01
-0.250000000000000000D+00
0.250000000000000056D+00
0.769230769230769273D-01
0.117743386633788305D+00
0.122377622377622383D+00
0.157721108027438911D+00
With 6 processors I get an error:
mpirun -np 6 ./a.out 9 A.dat B.dat
0 Error -- info = 3
1 Error -- info = 3
2 Error -- info = 3
3 Error -- info = 3
4 Error -- info = 3
5 Error -- info = 3
This does not happen with the ScaLAPACK example.
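To see which answer is actually correct, independent of the process grid, one possible cross-check is a serial solve of the same 9x9 system with LAPACK's DGESV; the following is only a minimal sketch, assuming A.dat and B.dat hold raw column-major 8-byte reals as described above and that plain LAPACK is linked:
! Hypothetical serial check: read the same binary A and B and solve with
! LAPACK DGESV so the distributed result can be compared against it.
PROGRAM serial_check
IMPLICIT NONE
INTEGER, PARAMETER :: mk = SELECTED_REAL_KIND(15,307)
INTEGER, PARAMETER :: N = 9
REAL(mk) :: A(N,N), B(N)
INTEGER :: ipiv(N), info, i
OPEN(10, FILE='A.dat', ACCESS='stream', FORM='unformatted', STATUS='old')
READ(10) A
CLOSE(10)
OPEN(10, FILE='B.dat', ACCESS='stream', FORM='unformatted', STATUS='old')
READ(10) B
CLOSE(10)
! DGESV overwrites B with the solution of A*X = B
CALL DGESV(N, 1, A, N, ipiv, B, N, info)
PRINT *, 'info = ', info
DO i = 1, N
   PRINT *, B(i)
END DO
END PROGRAM serial_check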