I am trying to run WRF (real.exe, wrf.exe) on a compute node through crontab, but the compute node fails to run the Slurm job. I think the MPI library has some problem when it is invoked from the cron environment.

I tried copying the terminal PATH variables when running the crontab job. Attached below are the log files generated when the job is run from the terminal and from crontab.
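Roughly, the cron side is set up like this (a simplified sketch rather than the exact crontab; the schedule, the log-file path, the trimmed PATH value, and the full path to sbatch are illustrative):

# crontab on the cluster head node (sketch; values are illustrative)
# PATH copied from an interactive terminal session, trimmed here for readability
PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/bin:/shared/gccWRF/netcdf/bin:/shared/gccWRF/bin:/usr/bin:/bin
# submit the WRF run script through Slurm once a day (full path used because cron's PATH may not include /opt/slurm/bin)
30 0 * * * cd /shared/foreasting/wrf && /opt/slurm/bin/sbatch real_slurm_run_wrf.sh >> /tmp/wrf_cron.log 2>&1

The same real_slurm_run_wrf.sh script is submitted in both cases; only the environment it inherits from cron differs.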

The Slurm log from a terminal run is as follows:

SLURM_NODELIST=compute-dy-c5n18xlarge-1
LDFLAGS=-L/shared/gccWRF/grib2/lib
SLURM_JOB_NAME=real_slurm_run_wrf.sh
MANPATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/man:/opt/slurm/share/man
XDG_SESSION_ID=2226
WRF_DIR=/shared/gccWRF/WRF
SLURMD_NODENAME=compute-dy-c5n18xlarge-1
SLURM_TOPOLOGY_ADDR=compute-dy-c5n18xlarge-1
HOSTNAME=compute-dy-c5n18xlarge-1
SLURM_PRIO_PROCESS=0
SLURM_NODE_ALIASES=(null)
SHELL=/bin/bash
TERM=xterm
HISTSIZE=1000
CPPFLAGS=-I/shared/gccWRF/grib2/include
TMPDIR=/tmp             
SLURM_TOPOLOGY_ADDR_PATTERN=node
SSH_CLIENT=157.33.216.217 58485 22
LIBRARY_PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/lib
JASPERINC=/shared/gccWRF/grib2/include
QTDIR=/usr/lib64/qt-3.3
DIR=/shared/gccWRF
QTINC=/usr/lib64/qt-3.3/include
SSH_TTY=/dev/pts/2
SHARED_DIR=/shared
SLURM_NNODES=1
USER=ec2-user
LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/lib:/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/lib/release:/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/lib
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:
TARGET_DIR=/shared/FORECAST/domains/test/
FI_PROVIDER_PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/lib/prov           
SLURM_JOBID=72
SLURM_NTASKS=8
SLURM_TASKS_PER_NODE=8
PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/bin:/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/bin:/shared/gccWRF/netcdf/bin:/shared/gccWRF/bin:/usr/lib64/qt-3.3/bin:/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin/:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/slurm/bin:/home/ec2-user/.local/bin:/home/ec2-user/bin
MAIL=/var/spool/mail/ec2-user
FCFLAGS=-m64
SLURM_WORKING_CLUSTER=parallelcluster:172.31.32.246:6820:9216:109
SLURM_CONF=/opt/slurm/etc/slurm.conf
SLURM_JOB_ID=72
JASPERLIB=/shared/gccWRF/grib2/lib
SLURM_JOB_USER=ec2-user
PWD=/shared/foreasting/wrf
_LMFILES_=/opt/intel/impi/2019.8.254/intel64/modulefiles//intelmpi
LANG=en_US.UTF-8
MODULEPATH=/opt/intel/impi/2019.8.254/intel64/modulefiles/
SLURM_JOB_UID=1000
LOADEDMODULES=intelmpi
SLURM_NODEID=0
F77=gfortran
SLURM_SUBMIT_DIR=/shared/foreasting/wrf
SLURM_TASK_PID=8972
SLURM_NPROCS=8
SLURM_CPUS_ON_NODE=8
CXX=g++
SLURM_PROCID=0
ENVIRONMENT=BATCH
HISTCONTROL=ignoredups
SLURM_JOB_NODELIST=compute-dy-c5n18xlarge-1
BUILDDIR=/shared/build/gccWRF
HOME=/home/ec2-user
SHLVL=2
SLURM_LOCALID=0
KMP_AFFINITY=granularity=fine,compact,1,0
SETUP_DIR=/shared/hpc-workshop-wrf
WPS_DIR=/shared/gccWRF/WPS
SLURM_JOB_GID=1000                    
SLURM_JOB_CPUS_PER_NODE=8
SLURM_CLUSTER_NAME=parallelcluster
FC=gfortran
SLURM_GTIDS=0
SLURM_SUBMIT_HOST=ip-172-31-32-246
NETCDF=/shared/gccWRF/netcdf
SLURM_JOB_PARTITION=compute
LOGNAME=ec2-user
QTLIB=/usr/lib64/qt-3.3/lib
CLASSPATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/lib/mpi.jar
SSH_CONNECTION=157.33.216.217 58485 172.31.32.246 22
XDG_DATA_DIRS=/home/ec2-user/.local/share/flatpak/exports/share:/var/lib/flatpak/exports/share:/usr/local/share:/usr/share
KMP_STACKSIZE=128M
GEOG_BASE_DIR=/shared/FORECAST/domains/
SCRIPTDIR=/shared/gccWRF/bin
SLURM_JOB_NUM_NODES=1
MODULESHOME=/usr/share/Modules
OMP_NUM_THREADS=2
LESSOPEN=||/usr/bin/lesspipe.sh %s
CC=gcc
XDG_RUNTIME_DIR=/run/user/1000
I_MPI_ROOT=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi
FFLAGS=-m64
BASH_FUNC_module()=() {  eval `/usr/bin/modulecmd bash $*`
}
_=/usr/bin/env
#################################
compute-dy-c5n18xlarge-1
#################################
Processes 4
 starting wrf task            0  of            4
 starting wrf task            1  of            4
 starting wrf task            2  of            4
 starting wrf task            3  of            4

The Slurm log from a crontab run is as follows:

SLURM_NODELIST=compute-dy-c5n18xlarge-1
SLURM_JOB_NAME=real_slurm_run_wrf.sh
LDFLAGS=-L/shared/gccWRF/grib2/lib
MANPATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/man:/usr/share/man:/usr/local/share/man
XDG_SESSION_ID=2233
SLURMD_NODENAME=compute-dy-c5n18xlarge-1
SLURM_TOPOLOGY_ADDR=compute-dy-c5n18xlarge-1
HOSTNAME=compute-dy-c5n18xlarge-1
WRF_DIR=/shared/gccWRF/WRF
SLURM_PRIO_PROCESS=0
SLURM_NODE_ALIASES=(null)
SHELL=/bin/sh
TMPDIR=/tmp
SLURM_TOPOLOGY_ADDR_PATTERN=node
CPPFLAGS=-I/shared/gccWRF/grib2/include
LIBRARY_PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/lib
JASPERINC=/shared/gccWRF/grib2/include
DIR=/shared/gccWRF
SHARED_DIR=/shared
SLURM_NNODES=1
USER=ec2-user
LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/lib:/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/lib/release:/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/lib
TARGET_DIR=/shared/FORECAST/domains/test/
FI_PROVIDER_PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/lib/prov
SLURM_JOBID=70
SLURM_NTASKS=8
SLURM_TASKS_PER_NODE=8
PATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/libfabric/bin:/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/bin:/shared/gccWRF/netcdf/bin:/shared/gccWRF/bin:/shared/gccWRF/netcdf/bin:/shared/gccWRF/bin:/opt/intel/impi/2019.8.254/intel64/modulefiles/:/usr/bin:/bin
SLURM_WORKING_CLUSTER=parallelcluster:172.31.32.246:6820:9216:109
SLURM_CONF=/opt/slurm/etc/slurm.conf
FCFLAGS=-m64
SLURM_JOB_ID=70
JASPERLIB=/shared/gccWRF/grib2/lib
SLURM_JOB_USER=ec2-user
PWD=/shared/foreasting/wrf
_LMFILES_=/opt/intel/impi/2019.8.254/intel64/modulefiles//intelmpi
LANG=en_US.UTF-8
MODULEPATH=/opt/intel/impi/2019.8.254/intel64/modulefiles/
LOADEDMODULES=intelmpi
SLURM_JOB_UID=1000         
SLURM_NODEID=0
SLURM_SUBMIT_DIR=/shared/foreasting/wrf
F77=gfortran
SLURM_TASK_PID=8643
SLURM_NPROCS=8
SLURM_CPUS_ON_NODE=8
SLURM_PROCID=0
ENVIRONMENT=BATCH
CXX=g++
SLURM_JOB_NODELIST=compute-dy-c5n18xlarge-1
SHLVL=3
HOME=/home/ec2-user
BUILDDIR=/shared/build/gccWRF
SLURM_LOCALID=0
SETUP_DIR=/shared/hpc-workshop-wrf
KMP_AFFINITY=granularity=fine,compact,1,0
SLURM_JOB_GID=1000
SLURM_JOB_CPUS_PER_NODE=8
SLURM_CLUSTER_NAME=parallelcluster
WPS_DIR=/shared/gccWRF/WPS
SLURM_GTIDS=0
SLURM_SUBMIT_HOST=ip-172-31-32-246
FC=gfortran
SLURM_JOB_PARTITION=compute
NETCDF=/shared/gccWRF/netcdf
LOGNAME=ec2-user
CLASSPATH=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi/intel64/lib/mpi.jar
SLURM_JOB_NUM_NODES=1
SCRIPTDIR=/shared/gccWRF/bin
GEOG_BASE_DIR=/shared/FORECAST/domains/
KMP_STACKSIZE=128M
OMP_NUM_THREADS=2
XDG_RUNTIME_DIR=/run/user/1000
CC=gcc
I_MPI_ROOT=/opt/intel/compilers_and_libraries_2020.2.254/linux/mpi
FFLAGS=-m64
_=/usr/bin/env
#################################
compute-dy-c5n18xlarge-1
#################################
Processes 4
[mpiexec@compute-dy-c5n18xlarge-1] check_exit_codes (../../../../../src/pm/i_hydra/libhydra/demux/hydra_demux_poll.c:117): unable to run bstrap_proxy on compute-dy-c5n18xlarge-1 (pid 8652, exit code 65280)
[mpiexec@compute-dy-c5n18xlarge-1] poll_for_event (../../../../../src/pm/i_hydra/libhydra/demux/hydra_demux_poll.c:159): check exit codes error
[mpiexec@compute-dy-c5n18xlarge-1] HYD_dmx_poll_wait_for_proxy_event (../../../../../src/pm/i_hydra/libhydra/demux/hydra_demux_poll.c:212): poll for event error
[mpiexec@compute-dy-c5n18xlarge-1] HYD_bstrap_setup (../../../../../src/pm/i_hydra/libhydra/bstrap/src/intel/i_hydra_bstrap.c:772): error waiting for event
[mpiexec@compute-dy-c5n18xlarge-1] main (../../../../../src/pm/i_hydra/mpiexec/mpiexec.c:1938): error setting up the boostrap proxies

Thank you for looking into the issue.
