使用 scalapack 考虑以下代码:
! if (norm2(h-x0) < tol) then
tmp_vec = h - x0
call pdnrm2(N,norm,tmp_vec,1,1,descvec,1)
if (norm < tol) then
x=h
converged = .true.
exit
endif
s = r0 - alpha*v
call pdgemv('N', N, N, 1.0, A, 1, 1, descmat, s, 1, 1, descvec, 1, 0.0,t, 1, 1, descvec, 1)
它是我尝试的迭代求解器的一部分,问题是如果我的处理器网格是二维的,我的向量在这些 procs 上没有任何元素,因此 dnrm2 产生零或norm
变量。因此导致一些proc从循环中提前退出,挂起整个循环。
除了手动广播等之外,确保正确分配规范值的正确方法是什么?
注意:这适用于 1-d procs 分布,请参阅: Inconsistent rows allocation in scalapack
下面给出的是我从维基百科文章中写的一个简单的 Bi-CGSTAB 求解器,它分别从文件 b.dat 和 A.dat 读取向量和矩阵。并继续使用 bicgstab_self_sclpk 例程解决它。下面打印的是值norm
对于 rank=4 运行:
...
current norm2 0.0000000000000000
Bi-CG STAB solver converged in iteration # 1 0.0000000000000000
current norm2 0.0000000000000000
Bi-CG STAB solver converged in iteration # 1 0.0000000000000000
...
一切都在这里。
对于排名 = 7 运行
. . .
current norm2 1.2377699991821143E-008
current norm2 1.2377699991821143E-008
Bi-CG STAB solver converged in iteration # 369 1.2377699991821143E-008
current norm2 1.2377699991821143E-008
Bi-CG STAB solver converged in iteration # 369 1.2377699991821143E-008
current norm2 1.2377699991821143E-008
Bi-CG STAB solver converged in iteration # 369 1.2377699991821143E-008
current norm2 1.2377699991821143E-008
Bi-CG STAB solver converged in iteration # 369 1.2377699991821143E-008
current norm2 1.2377699991821143E-008
Bi-CG STAB solver converged in iteration # 369 1.2377699991821143E-008
current norm2 1.2377699991821143E-008
Bi-CG STAB solver converged in iteration # 369 1.2377699991821143E-008
Bi-CG STAB solver converged in iteration # 369 1.2377699991821143E-008
. . .
module bicgstab
contains
subroutine bicgstab_self_sclpk(A,b,N,descvec,descmat,mloc_vec,nloc_vec)
use mpi
implicit none
real :: A(:,:), b(:,:), tol
integer(kind=8) :: N
integer :: maxiter, descvec(:),descmat(:), info, mloc_vec ,nloc_vec
integer :: i, ierr, rank, maxit
real :: rho0, alpha, omega0, rho, omega, beta, norm, tmp_real
real, allocatable :: r0(:,:), r(:,:), x0(:,:), x(:,:),h(:,:),t(:,:), tmp_vec(:,:)
real, allocatable :: rhat0(:,:),v(:,:), p(:,:), v0(:,:),p0(:,:),s(:,:)
logical :: converged
! ================== Initialize ======================
allocate(r0(mloc_vec,nloc_vec))
allocate(r(mloc_vec,nloc_vec))
allocate(rhat0(mloc_vec,nloc_vec))
allocate(x0(mloc_vec,nloc_vec))
allocate(x(mloc_vec,nloc_vec))
allocate(v0(mloc_vec,nloc_vec))
allocate(v(mloc_vec,nloc_vec))
allocate(p0(mloc_vec,nloc_vec))
allocate(p(mloc_vec,nloc_vec))
allocate(h(mloc_vec,nloc_vec))
allocate(s(mloc_vec,nloc_vec))
allocate(t(mloc_vec,nloc_vec))
allocate(tmp_vec(mloc_vec,nloc_vec))
x0 = 0
r0 = 0
r = 0
x = 0
v0 = 0
v = 0
p0 = 0
p = 0
h = 0
s = 0
t = 0
norm= 0
rhat0 = 0
rho0 = 1
rho = 0
alpha = 1
omega0 = 1
omega = 0
beta = 0
converged = .false.
r0(1:mloc_vec,1:nloc_vec) = b(1:mloc_vec,1:nloc_vec)
rhat0 = r0
tol = 1E-6
maxiter = 1000
call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
print *, rank , mloc_vec, nloc_vec
! print *, "rank",rank,"descmat",descmat
! print *, "rank",rank,"descvec",descvec
! ======================================================
! ================Loop==================================
do i = 1, maxiter
! rho = dot_product(rhat0(:,1),r0(:,1))
call pddot(N,rho, rhat0, 1,1,descvec,1,r0,1,1,descvec,1)
beta = (rho/rho0)*(alpha/omega0)
p = r0 + beta*(p0 - omega0*v0)
! v = matmul(A,p)
call pdgemv('N', N, N, 1.0, A, 1, 1, descmat, p, 1, 1, descvec, 1, 0.0,v, 1, 1, descvec, 1)
! alpha = rho/dot_product(rhat0(:,1),v(:,1))
call pddot(N,alpha,rhat0, 1,1,descvec,1,v,1,1,descvec,1)
alpha = rho/alpha
h = x0 + alpha*p
! if (norm2(h-x0) < tol) then
tmp_vec = h - x0
norm = 999.0
call pdnrm2(N,norm,tmp_vec,1,1,descvec,1)
! print *, "current norm1", norm, rank
if (norm < tol) then
x=h
converged = .true.
exit
endif
if (i==1) print *,"rank",rank,"was here"
s = r0 - alpha*v
! t = matmul(A,s)
call pdgemv('N', N, N, 1.0, A, 1, 1, descmat, s, 1, 1, descvec, 1, 0.0,t, 1, 1, descvec, 1)
! call pdgemm('N', 'N', N, 1, N, 1.0, A, 1, 1, descmat, s, 1, 1, descvec, 0.0, tmp_vec, 1, 1, descvec)
! t = tmp_vec
! omega = dot_product(t(:,1),s(:,1))/dot_product(t(:,1),t(:,1))
call pddot(N,omega,t, 1,1,descvec,1,s,1,1,descvec,1)
call pddot(N,tmp_real,t, 1,1,descvec,1,t,1,1,descvec,1)
omega = omega/tmp_real
x = h + omega*s
! if (norm2(x-x0)<tol) then
tmp_vec = x - x0
norm = 1000000
call pdnrm2(N,norm,tmp_vec,1,1,descvec,1)
if (norm < tol) then
print *, "current norm2", norm
converged = .true.
exit
endif
r = s - omega*t
x0 =x
rho0 = rho
p0 = p
r0 = r
v0 = v
omega0 = omega
enddo
! =========================================================
if (converged) then
print *, "Bi-CG STAB solver converged in iteration #", i, norm
else
print *, "Maximum iteration cycles reached"
endif
call MPI_Barrier(MPI_COMM_WORLD,ierr)
b = x
! print *,"rank ",rank
! =================clean up!===============================
deallocate(r0)
deallocate(r)
deallocate(rhat0)
deallocate(x0)
deallocate(x)
deallocate(v0)
deallocate(v)
deallocate(p0)
deallocate(p)
deallocate(h)
deallocate(s)
deallocate(t)
print *,"End of bicgstab"
end subroutine bicgstab_self_sclpk
end module bicgstab
program test_bicgstab
use bicgstab
use mpi
implicit none
character, parameter :: UPLO="U"
character(len=7) :: char_size
integer :: info
integer(kind=8) :: N, i, j
real(kind=8), allocatable :: A_global(:,:), b_global(:,:)
integer(kind=8) :: count_start, count_end,count_rate, dummy_int
real(kind=8) :: time
! =========================BLACS and MPI=======================
integer :: ierr, size, rank,dims(2)
! -------------------------------------------------------------
integer, parameter :: block_size = 100
integer :: context, nprow, npcol, local_nprow, local_npcol
integer :: numroc, indxl2g, descmat(9),descvec(9)
integer :: mloc_mat ,nloc_mat ,mloc_vec ,nloc_vec
real(kind=8), allocatable :: A(:,:), x(:,:), b(:,:)
call blacs_pinfo(rank,size)
dims=0
call MPI_Dims_create(size, 2, dims, ierr)
nprow = dims(1);npcol = dims(2)
call blacs_get(0,0,context)
call blacs_gridinit(context, 'R', nprow, npcol)
call blacs_gridinfo(context, nprow, npcol, local_nprow,local_npcol)
N = 700
allocate(A_global(N,N))
if (rank==0) open(101,file='A.dat')
do i = 1, N
if (rank==0) read(101,*) A_global(i,1:N)
call MPI_Bcast(A_global(i,1:N), N,MPI_DOUBLE,0,MPI_COMM_WORLD, ierr)
enddo
if (rank==0) close(101)
mloc_mat = numroc(N,block_size,local_nprow,0,nprow)
nloc_mat = numroc(N,block_size,local_npcol,0, npcol)
allocate(A(mloc_mat,nloc_mat))
do i = 1, mloc_mat
do j = 1,nloc_mat
A(i,j) = A_global(indxl2g(i,block_size,local_nprow,0, nprow),&
&indxl2g(j,block_size,local_npcol,0, npcol))
enddo
enddo
if (rank==0) print *, "Read matrix"
allocate(b_global(N,1))
if (rank==0) then
open(103,file='b.dat')
do i = 1, N
read(103,*) b_global(i,1)
enddo
close(103)
endif
call MPI_Bcast(b_global(:,1), N,MPI_DOUBLE,0,MPI_COMM_WORLD, ierr)
! set up scalapack shared matrices
if (rank==0) print *, "Matrix broadcasted"
mloc_vec = numroc(N,block_size,local_nprow,0, nprow)
nloc_vec = numroc(1,block_size,local_npcol,0, npcol)
print *,"Rank", rank, mloc_vec, nloc_vec
allocate(b(mloc_vec,nloc_vec))
allocate(x(mloc_vec,nloc_vec))
do i = 1, mloc_vec
do j = 1,nloc_vec
b(i,j) = b_global(indxl2g(i,block_size,local_nprow,0, nprow),&
&indxl2g(j,block_size,local_npcol,0, npcol))
x(i,j) = b_global(indxl2g(i,block_size,local_nprow,0, nprow),&
&indxl2g(j,block_size,local_npcol,0, npcol))
enddo
enddo
call descinit(descmat , N, N, block_size, block_size, 0,0,context,max(1,mloc_mat),info)
call descinit(descvec , N, 1, block_size, block_size, 0,0,context,max(1,mloc_vec),info)
if (rank==0) print *, "Set up done,solving"
! setup done, call in the cavalary
call MPI_Barrier(MPI_COMM_WORLD, ierr)
call bicgstab_self_sclpk(A,x,N, descvec, descmat,mloc_vec,nloc_vec)
! print *,x
call MPI_Barrier(MPI_COMM_WORLD,ierr)
call blacs_gridexit(context)
call blacs_exit(0)
end program test_bicgstab
如果需要,可以在这里下载矩阵文件:https ://github.com/ipcamit/temp_so