fortran - Fortran 可分配数组的低性能

Question

我使用的是 Intel Visual Fortran（IVF 2013 和 IVF 2019 两个版本）。使用可分配数组时，程序比使用静态内存分配时慢得多。也就是说，如果我把代码从

方法一：使用固定数组

do i = 1, 1000
  call A
end do 

subroutine A
  real(8) :: x(30)
  do things
end subroutine A

改为类似于下面的写法：

方法2：使用可分配数组

module module_size_is_defined
  n = 30
end module

do i = 1, 1000
    call A
end do 

subroutine A
  use module_size_is_defined
  real(8), allocatable :: x(:)
  allocate(x(n))
  do things
end subroutine A

代码就会慢得多。对于我的代码，静态分配需要 1 分 30 秒，而动态分配需要 2 分 30 秒。起初我以为是因为 allocate 位于循环中、执行分配操作耗时太多，于是又尝试了以下两种方法：

方法3：使用模块只分配一次数组

! Pseudo-code sketch: the size n is set and x is allocated once, up front,
! instead of inside the repeatedly called subroutine.
module module_x_is_allocated 
  n = 30
  allocate(x(n))
end module

do i = 1, 1000
  call A
end do 

subroutine A
  use module_x_is_allocated
  do things
end subroutine A

方法四：使用自动数组

module module_size_is_defined
  n = 30
end module


do i = 1, 1000
  call A
end do 

! Pseudo-code sketch: x is an automatic array whose size comes from the
! module variable n.
subroutine A
  use module_size_is_defined
  real(8) :: x(n)
  do things
end subroutine A

方法 3 和方法 4 所用的时间几乎与使用动态分配数组方法 2 的时间相同。两者都在 2 分钟 30 秒左右。所有案例都使用相同的优化进行编译。我尝试了 IVF 2013 和 IVF 2019，结果相同。我不知道为什么。尤其是方法 3，虽然 allocate 只运行一次，但仍然需要相同的时间。似乎动态分配的数组存储在比静态分配的数组慢的地方，并且分配不需要额外的时间（因为方法2和3需要相同的时间）。

以更有效的方式分配数组以减少性能损失的任何想法和建议？谢谢。

!============================================================================ 编辑 1：

我的程序太长，无法在此处发布。因此，我写了一些小的测试代码。结果有点奇怪。我试了三种情况：

方法一：耗时28.98s

! Holds the problem size n shared by the main program and the subroutines.
module module_size_is_defined
  implicit none
  integer(4) :: n  ! assigned by the main program before the subroutines are called
end module

! Benchmark driver for the fixed-size work array variant: calls A 50000 times
! and accumulates the matrices it returns into z.
program main
  use module_size_is_defined
  implicit none
  
  integer(4) :: i
  real(8) :: y(50,50),z(50,50),t
  
  n = 50
  ! z is an accumulator and must be defined before the first z = z + y;
  ! without this the printed result is undefined.
  z = 0.0D0
  do i =1,50000
    t=dble(i) * 2.0D0
    call A(y,t)
    z = z + y
  end do
  ! Print one accumulated element.
  write(*,*) z(1,1)
end program main
  
! Sets y to the sum over j = 1..200 of matmul(transpose(x) + j**2, x), where
! x is refilled by getX for every j. The work array x is a fixed-size local.
!   y : output matrix, n-by-n
!   t : input scalar forwarded to getX
subroutine A(y,t)
  use module_size_is_defined
  implicit none
  real(8),intent(out):: y(n,n)
  real(8),intent(in) :: t
  integer(4) :: j
  real(8) :: x(1,50)  ! fixed-size work array (the variant being timed here)
  
  y=0.0D0
  do j = 1, 200
    call getX(x,t,j)
    ! (50-by-1 column plus scalar) times (1-by-50 row) gives a 50-by-50 term
    y = y + matmul( transpose(x) + dble(j)**2, x )
  end do
endsubroutine A
  
  
! Fills the 1-by-n row vector x with x(1,i) = (i+j) * t**1.5 for i = 1..n.
!   x : output row vector, 1-by-n (n comes from module_size_is_defined)
!   t : input scalar, raised to the power 1.5
!   j : input integer offset added to the column index
subroutine getX(x,t,j)
  use module_size_is_defined
  implicit none
  real(8),intent(out) :: x(1,n)
  real(8),intent(in) :: t
  integer(4),intent(in) :: j
  integer(4) :: i
  
  do i =1, n
    x(1,i)  = dble(i+j) * t ** (1.5D00) 
  end do
endsubroutine getX

方法2：耗时30.56s

! Holds the problem size n shared by the main program and the subroutines.
module module_size_is_defined
  implicit none
  integer(4) :: n  ! assigned by the main program before the subroutines are called
end module

! Benchmark driver for the variant that allocates the work array on every call
! of A: calls A 50000 times and accumulates the matrices it returns into z.
program main
  use module_size_is_defined
  implicit none
  
  integer(4) :: i
  real(8) :: y(50,50),z(50,50),t
  
  n = 50
  ! z is an accumulator and must be defined before the first z = z + y;
  ! without this the printed result is undefined.
  z = 0.0D0
  do i =1,50000
    t=dble(i) * 2.0D0
    call A(y,t)
    z = z + y
  end do
  ! Print one accumulated element.
  write(*,*) z(1,1)
end program main
  
! Sets y to the sum over j = 1..200 of matmul(transpose(x) + j**2, x), where
! x is refilled by getX for every j. The work array x is allocated on each call.
!   y : output matrix, n-by-n
!   t : input scalar forwarded to getX
subroutine A(y,t)
  use module_size_is_defined
  implicit none
  real(8),intent(out):: y(n,n)
  real(8),intent(in) :: t
  integer(4) :: j
  real(8),allocatable :: x(:,:)
  ! Allocated on every call; there is no explicit deallocate, so as an unsaved
  ! local allocatable it is released automatically when the subroutine returns.
  allocate(x(1,n))
  
  y=0.0D0
  do j = 1, 200
    call getX(x,t,j)
    ! (n-by-1 column plus scalar) times (1-by-n row) gives an n-by-n term
    y = y + matmul( transpose(x) + dble(j)**2, x )
  end do
endsubroutine A
  
  
! Fills the 1-by-n row vector x with x(1,i) = (i+j) * t**1.5 for i = 1..n.
!   x : output row vector, 1-by-n (n comes from module_size_is_defined)
!   t : input scalar, raised to the power 1.5
!   j : input integer offset added to the column index
subroutine getX(x,t,j)
  use module_size_is_defined
  implicit none
  real(8),intent(out) :: x(1,n)
  real(8),intent(in) :: t
  integer(4),intent(in) :: j
  integer(4) :: i
  
  do i =1, n
    x(1,i)  = dble(i+j) * t ** (1.5D00) 
  end do
endsubroutine getX

方法3：耗时78.72s

! Holds the problem size n shared by the main program, the array module and
! the subroutines.
module module_size_is_defined
  implicit none
  integer(4) :: n  ! assigned by the main program before init is called
endmodule

! Owns the work array x as a saved module variable so that it is allocated
! once by init rather than on each call of A.
module module_array_is_allocated
  use module_size_is_defined
  implicit none
  real(8), allocatable,save :: x(:,:)  ! work array, shape (1,n) after init

  contains
  ! Allocates x as a 1-by-n array; n must already hold the desired size.
  subroutine init
    implicit none
    allocate(x(1,n))
  endsubroutine
endmodule module_array_is_allocated

! Benchmark driver for the variant with a module-level work array allocated
! once by init: calls A 50000 times and accumulates the returned matrices in z.
program main
  use module_size_is_defined
  use module_array_is_allocated
  implicit none
  
  integer(4) :: i
  real(8) :: y(50,50),z(50,50),t
  
  n = 50
  ! n must be set before init, which uses it as the extent of x.
  call init
  ! z is an accumulator and must be defined before the first z = z + y;
  ! without this the printed result is undefined.
  z = 0.0D0
  do i =1,50000
    t=dble(i) * 2.0D0
    call A(y,t)
    z = z + y
  end do
  ! Print one accumulated element.
  write(*,*) z(1,1)
end program main
  
! Sets y to the sum over j = 1..200 of matmul(transpose(x) + j**2, x), where
! x is refilled by getX for every j. Here x is the module-level allocatable
! array from module_array_is_allocated, not a local.
!   y : output matrix, n-by-n
!   t : input scalar forwarded to getX
subroutine A(y,t)
  use module_size_is_defined
  use module_array_is_allocated
  implicit none
  real(8),intent(out):: y(n,n)
  real(8),intent(in) :: t
  integer(4) :: j
  
  y=0.0D0
  do j = 1, 200
    call getX(x,t,j)
    ! (n-by-1 column plus scalar) times (1-by-n row) gives an n-by-n term
    y = y + matmul( transpose(x) + dble(j)**2, x )
  end do
endsubroutine A
  
  
! Fills the 1-by-n row vector x with x(1,i) = (i+j) * t**1.5 for i = 1..n.
!   x : output row vector, 1-by-n (n comes from module_size_is_defined)
!   t : input scalar, raised to the power 1.5
!   j : input integer offset added to the column index
subroutine getX(x,t,j)
  use module_size_is_defined
  implicit none
  real(8),intent(out) :: x(1,n)
  real(8),intent(in) :: t
  integer(4),intent(in) :: j
  integer(4) :: i
  
  do i =1, n
    x(1,i)  = dble(i+j) * t ** (1.5D00) 
  end do
endsubroutine getX

现在，对于这个规模较小的问题，方法 1 和方法 2 的耗时几乎相同。按理说方法 3 应该比方法 2 更好，因为它只分配一次 x(1,n)，但它反而慢得多。而在我之前的程序中，方法 2 的耗时几乎和方法 3 一样。这很奇怪。

我在 Windows 和 Linux 上都试过，使用 Release 配置、-O2 优化，以及不同版本的 IVF。

fortran - Fortran 可分配数组的低性能

0 回答 0

Related

Reference