我一直在调试一个导致我的模拟软件崩溃的段错误/SIGABRT。我已经把问题追踪到某个 LAPACK 子例程:它修改了(或者我认为实际上是释放了)调用方子例程中的一个网格计数器,而这个调用方子例程是通过其他几个子例程间接调用该 LAPACK 子例程的。下面是我追踪此错误时的 gdb 调试会话:
(gdb) break trifactorize
Breakpoint 1 at 0x44ad28: file /home/nspeelman/chem1d/src/solver.f, line 925.
(gdb) run
Starting program: /home/nspeelman/chem1d/bin/chem1d
(gdb) watch k
Hardware watchpoint 2: k
(gdb) c
Continuing.
Hardware watchpoint 2: k
Old value = 4
New value = 1
0x000000000044ad48 in trifactorize (lsing=@0x7fffffffdedc) at /home/nspeelman/chem1d/src/solver.f:927
927 DO k = 1, Npoint
(gdb) c
Continuing.
Hardware watchpoint 2: k
Old value = 1
New value = -214061846
dlatrs (uplo=@0x4ffea4, trans=@0x4ffe98, diag=@0x4ffe94, normin=@0x7fffffffbeab, n=@0x3043664, a=0x16867060,
lda=@0x4fcda4, x=0x7fffffffcf90, scale=@0x7fffffffbe98, cnorm=0x7fffffffd910, info=@0x7fffffffdc2c, _uplo=5,
_trans=12, _diag=4, _normin=1) at /home/nspeelman/chem1d/src/lapack/src/dlatrs.f:334
334 DO 20 J = 1, N - 1
(gdb) c
Continuing.
Program received signal SIGABRT, Aborted.
0x00002aaaab480265 in raise () from /lib64/libc.so.6
以下是回溯(backtrace)信息:
(gdb) bt
#0 0x00002aaaab480265 in raise () from /lib64/libc.so.6
#1 0x00002aaaab481d10 in abort () from /lib64/libc.so.6
#2 0x00002aaaaad88c9e in internal_unpack (d=0x62e9, s=0x62e9)
at ../../../gcc-4.3.4/libgfortran/runtime/in_unpack_generic.c:104
#3 0x000000000044b41c in trifactorize (lsing=@0x7fffffffdedc) at /home/nspeelman/chem1d/src/solver.f:940
#4 0x3f68b52055ec1bbd in ?? ()
#5 0x3f62d2224f8b7801 in ?? ()
#6 0x3f62e59e02e2572f in ?? ()
#7 0x3f70bab1bd0628c7 in ?? ()
#8 0x3f70cdc893daf1cf in ?? ()
#9 0x3f5a3418697ab4dc in ?? ()
#10 0x3f6b117db1893c97 in ?? ()
#11 0x3f6e0dd1b55652b4 in ?? ()
#12 0x3f6864101f64d2f0 in ?? ()
#13 0x3f7359216186a4dc in ?? ()
#14 0x3f527ee1ff8feb69 in ?? ()
#15 0x3f672c7c504a10f8 in ?? ()
#16 0x3f68f2c8e0ee6963 in ?? ()
#17 0x3f54726715d81583 in ?? ()
#18 0x3f68f2c8e0ee6963 in ?? ()
#19 0x3f6df6a7d0f5e9a5 in ?? ()
#20 0x3f5ef57fdf747822 in ?? ()
#21 0x3f56ef95d71519b0 in ?? ()
#22 0x3f6b736cc1c1feb4 in ?? ()
#23 0x3f60fb91d9400ca4 in ?? ()
#24 0x3f56ef95d71519b0 in ?? ()
#25 0x3f4b4753f00f24d5 in ?? ()
#26 0x3f5a8b9cc465a316 in ?? ()
#27 0x3f3855b423b18a6b in ?? ()
#28 0x3f568360294ec05f in ?? ()
#29 0x3f3679d6e42a4759 in ?? ()
#30 0x3f228fe18a3e97ab in ?? ()
#31 0x3f50df603cf17c50 in ?? ()
#32 0x3f5d0cea5690c8f8 in ?? ()
#33 0x3f550552679170e1 in ?? ()
#34 0x3f3d0ebaa18f7a6f in ?? ()
#35 0x3f66a1b9ef4a7dc4 in ?? ()
#36 0x3f345e9bec8a3d7a in ?? ()
#37 0x3f43854a676ff7cb in ?? ()
#38 0x3f4acbe712d1ba00 in ?? ()
#39 0x3f191497deb0cd86 in ?? ()
#40 0x3f48e220ec5df7ee in ?? ()
#41 0x3f4326498a95447b in ?? ()
#42 0x3f2e05ee4edaa6ff in ?? ()
#43 0x3f285f8e79fe6b92 in ?? ()
#44 0x3f58240bec575a1d in ?? ()
#45 0x3f47c79be94754f7 in ?? ()
#46 0x3f5cf356ce58b75a in ?? ()
#47 0x3f28c2a87c82305d in ?? ()
#48 0x3f35fca48157c9e4 in ?? ()
#49 0x3f41c924b53cdbae in ?? ()
#50 0x3f477c6c115fb520 in ?? ()
#51 0x0000000000000000 in ?? ()
我已经在以下环境中复现了这个问题:Ubuntu 11.10(gfortran 4.6.1)、Ubuntu 12.04(gfortran 4.6.3)、Scientific Linux 5.6(gfortran 4.3.4)以及 Microsoft Windows(gfortran 4.5.0-1)。在 Linux 机器上使用英特尔编译器(ifort)时,我无法复现此错误;但由于我使用的是学术许可证,无法在 Windows 上使用 ifort。而我需要为一些学生准备一个 Windows 版本,因此必须在 gfortran 下解决这个问题。发布版本使用的编译器标志是 -funroll-all-loops -fno-f2c -O3,调试版本使用的是 -fno-f2c -O0 -g3。这两种配置下都会出现这个问题。
此外,此错误只有在使用大型数组时才能复现。我使用的数组最大尺寸为 (500,Ns),工作数组的尺寸为 (Ns,Ns,500)。当 Ns = 53 时模拟不会崩溃,当 Ns = 153 时则会崩溃,而 Ns 的声明大小为 200。
最后给出发生崩溃的代码。solver.f 中的子例程 trifactorize:
lSing = .FALSE.
DO k = 1, Npoint
c---- Compute (jacB(k)-jacA(k)*jacC(k-1)). -----------------------------
SELECT CASE ( k )
CASE ( 1 )
CASE DEFAULT
CALL DGEMM( 'N', 'N', Ns, Ns, Ns, -1.0D0, jacA(:,:,k),
> NsMax, jacC(:,:,k-1), NsMax, 1.0d0,
> jacB(:,:,k), NsMax )
END SELECT
c---- Factor with Gaussian elimination and estimate condition number.---
norm = DLANGE( '1', Ns, Ns, jacB(:,:,k), NsMax, Work )
CALL DGETRF( Ns, Ns, jacB(:,:,k), NsMax, ip(:,k), INFO )
CALL DGECON( '1', Ns, jacB(:,:,k), NsMax, norm, Condit(k),
> Work, IWork, INFO )
c WRITE(*,*)k,condit(k)
IF ((1.0d0+condit(k)).EQ.1.0d0 .AND. iLogging.EQ.iDebug) THEN
Write(line,10) 'Singular Jacobian Matrix'
CALL ScreenWrite(line, iNormal)
Write(line,11) 'Gridnumber: ', k
CALL ScreenWrite(line, iNormal)
lSing = .TRUE.
RETURN
ENDIF
c---- Compute jacC/jacB'-matrix ----------------------------------------
CALL DGETRS( 'N', Ns, Ns, jacB(:,:,k), NsMax, ip(:,k),
> jacC(:,:,k), NsMax, INFO )
ENDDO
10 FORMAT(9X,3('-'),1X,9('-'),1X,9('-'),1X,A)
11 FORMAT(9X,3('-'),1X,9('-'),1X,9('-'),1X,A,i4)
CALL LogWrite('==> Decompose : Finished', iDebug)
RETURN
dlatrs.f:
*
* A is lower triangular.
*
DO 20 J = 1, N - 1
CNORM( J ) = DASUM( N-J, A( J+1, J ), 1 )
20 CONTINUE
CNORM( N ) = ZERO
我想知道是我使用了错误的编译器标志,还是碰巧遇到了一个罕见的 gfortran 缺陷(bug)。希望有人知道如何解决这个问题。