1

我尝试做一个 mpi 共享内存示例,但每次我得到一些奇怪的值。

它是一个 1D 模板,只是对位置 i-1、i 和 i+1 处的元素求和

我在 32 MPI 进程的 2 个节点上运行此程序,并且域大小 nx=64,每个等级的域只有 1 个元素。我用 MPI_SENDRECEIVE 和幽灵单元在节点之间进行交换

program mpishared
  USE MPI_F08
  use ISO_C_BINDING
  implicit none
  integer :: rank, rankNode, rankW, rankE
  integer :: nbp, nbNode
  integer :: key
  TYPE(MPI_Comm) :: commNode ! shared node
  integer :: nx ! area global
  integer :: sx,ex ! area local
  integer :: rsx,rex ! real bound of local array with halo
  integer(kind=MPI_ADDRESS_KIND) :: size
  TYPE(C_PTR) :: baseptr
  TYPE(MPI_Win) :: win
  integer, parameter :: dp = kind(1.d0)
  real(kind=dp), dimension(:), contiguous, pointer :: ushared
  real(kind=dp), dimension(:), allocatable :: u
  integer :: iterx,iter,iterp

  !! Init MPI
  CALL MPI_INIT()

  !! Info WORLD
  CALL MPI_COMM_RANK(MPI_COMM_WORLD,rank)
  CALL MPI_COMM_SIZE(MPI_COMM_WORLD,nbp)

  ! Comm 4 Node
  key = 0
  CALL MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,key,MPI_INFO_NULL,commNode)
  CALL MPI_COMM_RANK(commNode, rankNode)
  CALL MPI_COMM_SIZE(commNode, nbNode)
  ! Neighbours
  rankW = rank-1
  rankE = rank+1
  if (rank == 0) rankW=MPI_PROC_NULL
  if (rank == nbp-1) rankE=MPI_PROC_NULL

  ! Size of global domain
  nx = 64

  ! Size of local domain
  sx = 1+(rank*nx)/nbp
  ex = ((rank+1)*nx)/nbp
  rsx = sx ! real size only different for first
  rex = ex ! and last rank in node
  if (rankNode == 0) rsx = rsx-1
  if (rankNode == nbNode-1) rex=rex+1

  ! Allocate Shared domain
  size = (rex-rsx+1)
  allocate(u(rex-rsx+1))
  CALL MPI_WIN_ALLOCATE_SHARED(size,1,MPI_INFO_NULL,commNode,baseptr,win)
  CALL C_F_POINTER(baseptr,ushared)

  ! Init local domain
  do iterx=1,rex-rsx+1
    u(iterx) = 0.0_dp
  end do
  if (rank == nbp-1) then
    u(rex-rsx+1) = rex
  end if
  if (rank == 0) then
    u(1) = -1.0_dp
  end if

  ! Main Loop
  CALL MPI_WIN_LOCK_ALL(0,win)
  do iter=1,10

    ! Update sharedold
    do iterx=1,rex-rsx+1
      ushared(iterx)=u(iterx)
    end do
    ! Update bound between node
    if (rankNode == 0) then
      CALL MPI_SENDRECV(ushared(2),nx,MPI_DOUBLE_PRECISION,rankW,100, &
                        ushared(1),nx,MPI_DOUBLE_PRECISION,rankW,100,&
                        MPI_COMM_WORLD,MPI_STATUS_IGNORE)
    end if
    if (rankNode == nbNode-1) then
      CALL MPI_SENDRECV(ushared(ex-rsx+1),nx,MPI_DOUBLE_PRECISION,rankE,100, &
                        ushared(rex-rsx+1),nx,MPI_DOUBLE_PRECISION,rankE,100,&
                        MPI_COMM_WORLD,MPI_STATUS_IGNORE)
    end if

    call MPI_WIN_SYNC(win)
    call MPI_BARRIER(MPI_COMM_WORLD)

    ! Compute
    do iterx=sx-rsx+1,ex-rsx+1
      u(iterx)=(ushared(iterx-1)+ushared(iterx)+ushared(iterx+1))/3.0_dp
      !print *, rank, iterx, u(iterx), ushared(iterx-1), ushared(iterx), ushared(iterx+1)
    end do

    call MPI_BARRIER(MPI_COMM_WORLD)
  end do
  call MPI_WIN_UNLOCK_ALL(win)

  do iterp=0, nbp-1
    if (iterp == rank) then
      do iterx=1,rex-rsx+1
        print * , iter,"u", rank, iterx, u(iterx)
      end do
    end if
    call MPI_BARRIER(MPI_COMM_WORLD)
  end do

  CALL MPI_FINALIZE()
end program 

多次迭代后的值必须等于秩

但是当我运行它时,错误的值开始出现(如 -6.018996517484083E+196 )

由于我是 MPI RMA 的新手,我不知道这是我使用的 MPI 实现的错误还是我做错了什么

4

0 回答 0