fortran - 如何提高这个巨大的嵌套循环的性能？(Fortran 90)

Question

我将在这里发布整个代码段，但唯一的问题是最后的嵌套循环。所有读入矩阵的尺寸均为 180x180，并且循环速度慢得难以忍受。我没有看到简化计算的简单方法，因为索引的三次出现，得到矩阵“AnaInt”的索引乘法不是简单的矩阵乘积。有什么想法吗？谢谢！

program AC
 implicit none
  integer, parameter :: dp = selected_real_kind(15, 307)
  integer :: n, ndim, k, j, i, o, l, m, steps
  real(dp) :: emax, omega, pi, EFermi, auev
  complex(dp) :: Grs,Gas, ACCond, tinyc, cunit, czero, cone

  complex(dp), allocatable :: GammaL(:,:)     
  complex(dp), allocatable :: GammaL_EB(:,:)  
  complex(dp), allocatable :: GammaR(:,:)     
  complex(dp), allocatable :: R(:,:)  
  complex(dp), allocatable :: Yc(:,:)         
  complex(dp), allocatable :: Yd(:,:)         
  complex(dp), allocatable :: AnaInt(:,:)     
  complex(dp), allocatable :: H(:,:)         
  complex(dp), allocatable :: HamEff(:,:)     
  complex(dp), allocatable :: EigVec(:,:)    
  complex(dp), allocatable :: InvEigVec(:,:)  
  complex(dp), allocatable :: EigVal(:)       
  complex(dp), allocatable :: ctemp(:,:)      
  complex(dp), allocatable :: ctemp2(:,:)      
  complex(dp), allocatable :: S(:,:)          
  complex(dp), allocatable :: SelfL(:,:)     
  complex(dp), allocatable :: SelfR(:,:)     
  complex(dp), allocatable :: SHalf(:,:)      
  complex(dp), allocatable :: InvSHalf(:,:)   
  complex(dp), allocatable :: HEB(:,:)
  complex(dp), allocatable :: Integrand(:,:)


!Lapack arrays and variables
  integer :: info, lwork
  complex(dp), allocatable :: work(:)       
  real(dp), allocatable :: rwork(:)    
  integer,allocatable :: ipiv(:)

!########################################################################

!Constants
    auev = 27.211385
    pi = 3.14159265359
    cunit = (0,1)
    czero = (0,0)
    cone = (1,0)
    tinyc = (0.0, 0.000000000001)


!System and calculation parameters
    open(unit=123, file="ForAC.dat", action='read', form='formatted')
    read(123,*) ndim, EFermi
    lwork = ndim*ndim

    emax = 5.0/auev
    steps = 1000 


    allocate(HEB(ndim,ndim))
    allocate(H(ndim,ndim))
    allocate(Yc(ndim,ndim))
    allocate(Yd(ndim,ndim))
    allocate(S(ndim,ndim))
    allocate(SelfL(ndim,ndim))
    allocate(SelfR(ndim,ndim))
    allocate(HamEff(ndim,ndim))
    allocate(GammaR(ndim,ndim))
    allocate(GammaL(ndim,ndim))
    allocate(AnaInt(ndim,ndim))
    allocate(EigVec(ndim,ndim))
    allocate(EigVal(ndim))
    allocate(InvEigVec(ndim,ndim))
    allocate(R(ndim,ndim))
    allocate(GammaL_EB(ndim,ndim))
    allocate(Integrand(ndim,ndim))

!################################################



    read(123,*) H, S, SelfL, SelfR
    close(unit=123)

    HamEff(:,:)=(H(:,:) + SelfL(:,:) + SelfR(:,:))   



    allocate(SHalf(ndim, ndim))
    allocate(InvSHalf(ndim,ndim))
    SHalf(:,:) = (cmplx(real(S(:,:),dp),0.0_dp,dp))

    call zpotrf('l', ndim, SHalf, ndim, info)         
    InvSHalf(:,:) = SHalf(:,:)
    call ztrtri('l', 'n', ndim, InvSHalf, ndim, info) 

    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim) 
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim) 
    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim) 
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim) 
    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)

    deallocate(SHalf)
    deallocate(InvSHalf)




!In the PDF: B = EigVec, B^(-1) = InvEigVec, Hk = EigVal

    allocate(ctemp(ndim,ndim))
    ctemp(:,:) = HamEff(:,:)
    allocate(work(lwork),rwork(2*ndim))
    call zgeev('N', 'V', ndim, ctemp, ndim, EigVal, InvEigVec, ndim, EigVec, ndim, work, lwork, rwork, info)
    if(info/=0)write(*,*) "Warning: zgeev info=", info
    deallocate(work,rwork)
    deallocate(ctemp) 

    InvEigVec(:,:)=EigVec(:,:)
    lwork = 3*ndim
    allocate(ipiv(ndim))
    allocate(work(lwork))
    call zgetrf(ndim,ndim,InvEigVec,ndim,ipiv,info)
    if(info/=0)write(*,*) "Warning: zgetrf info=", info   ! LU decomposition
    call zgetri(ndim,InvEigVec,ndim,ipiv,work,lwork,info)
    if(info/=0)write(*,*) "Warning: zgetri info=", info ! Inversion by LU decomposition (Building of InvEigVec)
    deallocate(work)
    deallocate(ipiv)


 R(:,:) = 0.0_dp
 do j=1,ndim
 do m=1,ndim
 do k=1,ndim
 do l=1,ndim
 R(j,m) = R(j,m) + InvEigVec(j,k) * GammaR(k,l) * conjg(InvEigVec(m,l))
 end do
 end do
 end do
 end do





!!!THIS IS THE LOOP IN QUESTION. MATRIX DIMENSION 180x180, STEPS=1000

 open(unit=125,file="ACCond.dat")

     !Looping over omega
     do o=1,steps
         omega=real(o,dp)*emax/real(steps,dp) 
         AnaInt(:,:) = 0.0_dp
         do i=1,ndim
             do n=1,ndim
                 do j=1,ndim
                      do m=1,ndim
                           Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
                           Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
                           Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

                           AnaInt(i,n)= AnaInt(i,n) + EigVec(i,j) * R(j,m) * Integrand(j,m) * conjg(EigVec(n,m))
                      end do
                 end do
             end do
        end do 

         Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
         Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

          ACCond = czero
          do k=1,ndim
              ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
          end do
          write(125,*) omega, real(ACCond,dp), aimag(ACCond)
      end do



!#############################################

    deallocate(Integrand)
    deallocate(HEB)
    deallocate(Yc)
    deallocate(Yd)
    deallocate(HamEff)
    deallocate(GammaR)
    deallocate(GammaL)
    deallocate(AnaInt)
    deallocate(EigVec)
    deallocate(EigVal)
    deallocate(InvEigVec)
    deallocate(H)
    deallocate(S)
    deallocate(SelfL)
    deallocate(SelfR)
    deallocate(R)
    deallocate(GammaL_EB)
end program AC

因此，这是根据建议进行的第一次改编：

HermEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermEigVec(i,j) = conjg(EigVec(j,i))
end do
end do

HermInvEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermInvEigVec(i,j) = conjg(InvEigVec(j,i))
end do
end do


R(:,:) = 0.0_dp

R = matmul(InvEigVec,matmul(GammaR,HermInvEigVec))


open(unit=125,file="ACCond.dat")

    !Looping over omega
     do o=1,steps
         omega=real(o,dp)*emax/real(steps,dp)

         AnaInt(:,:) = 0.0_dp
             do j=1,ndim
             do m=1,ndim
                 Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
                 Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
                 Integrand(j,m) = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
                 T(j,m) = R(j,m) * Integrand(j,m)
             end do
             end do
         AnaInt = matmul(EigVec,matmul(T,HermEigVec))


         Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)                      
         Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

         ACCond = czero
         do k=1,ndim
             ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
         end do
       write(125,*) omega, real(ACCond,dp), aimag(ACCond)
     end do

score 2 · Accepted Answer

您的代码中有几个问题。让我们从你强调的循环之前的循环开始（它更容易理解，但下面的大循环或多或少存在相同的问题）。

所以我们在 i, j, k, l 上有一个循环。

您可以考虑重新排序循环，以获得更好的缓存访问。您最内部的循环位于 l 上，它仅显示为列索引。使用Fortran 中的以列为主的数组，您可以预期性能会很差。j 上的内部循环可能会更好。

更糟糕的是，您的整个循环是由三个矩阵 (InvEigVec * GammaR * InvEigVec^H) 的乘积进行的矩阵更新，但您在 O(ndim^4) 中执行此操作。每个矩阵乘积为 O(n^3) （如果您使用Strassen algorithm调用优化的 ZGEMM ，则可能更少）。因此，通过存储矩阵乘积，两个乘积应该是 O(n^3)，而不是 O(n^4)。也就是说，您可以进行矩阵乘积，然后进行矩阵乘积更新。

现在，你的大循环：步数超过 i、n、j、m。

如果我读得好，你写

Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

其中右侧的所有变量都是标量，但 Integrand 是 ndim*ndim 矩阵。在多个地方复制一个值需要做很多工作。但是然后你在积分上循环，在那里你可以简单地使用一个标量。或者也许这是一个错误，你应该在左侧有 Integrand(j, m) 或类似的东西？

然后，您的四个内部循环就像前面的评论一样，使用数组乘积 EigVec * (R .* Integrand) * EigVec^H 更新 AnaInt，其中 .* 是数组的（逐项）标量积（或只是 EigVec * R * EigVec^H 如果 Integrand 只是一个标量）。

同样，尝试使用 ZGEMM 编写它可能会很好，从而大大降低了复杂性。

score 1 · Accepted Answer

您是否考虑过使用 OPENMP 对循环进行并行化？这很容易实现。如果有兴趣我可以给你一些提示。

试试看这里：openMP DO 教程

fortran - 如何提高这个巨大的嵌套循环的性能？(Fortran 90)

2 回答 2

Related

Reference