我习惯使用 Torque,所以希望有熟悉 Sun SGE 的高手可以提供帮助。我不知道为什么我的作业会在运行 3 小时后被终止。在新安装的 ROCKS 6.2 上,作业被提交到没有竞争的空队列。运行时间少于 3 小时的作业没有任何问题。
这是服务器配置(我想应该是这个):
[user@machine]$ qconf -ssconf
algorithm default
schedule_interval 0:0:05
maxujobs 0
queue_sort_method load
job_load_adjustments np_load_avg=0.50
load_adjustment_decay_time 0:10:30
load_formula slots
schedd_job_info false
flush_submit_sec 1
flush_finish_sec 10
params none
reprioritize_interval 0:0:0
halftime 168
usage_weight_list cpu=1.000000,mem=0.000000,io=0.000000
compensation_factor 5.000000
weight_user 0.250000
weight_project 0.250000
weight_department 0.250000
weight_job 0.250000
weight_tickets_functional 0
weight_tickets_share 0
share_override_tickets TRUE
share_functional_shares TRUE
max_functional_jobs_to_schedule 200
report_pjob_tickets TRUE
max_pending_tasks_per_job 50
halflife_decay_list none
policy_hierarchy OFS
weight_ticket 0.010000
weight_waiting_time 0.000000
weight_deadline 3600000.000000
weight_urgency 0.100000
weight_priority 1.000000
max_reservation 0
default_duration INFINITY
队列配置
[user@machine]$ qconf -sql
all.q
[user@machine]$ qconf -sq all.q
qname all.q
hostlist @allhosts
seq_no 0
load_thresholds np_load_avg=1.75
suspend_thresholds NONE
nsuspend 1
suspend_interval 00:05:00
priority 0
min_cpu_interval 00:05:00
processors UNDEFINED
qtype BATCH INTERACTIVE
ckpt_list NONE
pe_list make mpi mpich orte
rerun FALSE
slots 1,[compute-0-0.local=4],[compute-1-0.local=4], \
[compute-1-1.local=4],[compute-1-2.local=4], \
[compute-1-3.local=4],[compute-1-4.local=4], \
[compute-2-0.local=4],[compute-2-1.local=4], \
[compute-2-2.local=4],[compute-2-4.local=4]
tmpdir /tmp
shell /bin/bash
prolog NONE
epilog NONE
shell_start_mode posix_compliant
starter_method NONE
suspend_method NONE
resume_method NONE
terminate_method NONE
notify 00:00:60
owner_list NONE
user_lists NONE
xuser_lists NONE
subordinate_list NONE
complex_values NONE
projects NONE
xprojects NONE
calendar NONE
initial_state default
s_rt INFINITY
h_rt INFINITY
s_cpu INFINITY
h_cpu INFINITY
s_fsize INFINITY
h_fsize INFINITY
s_data INFINITY
h_data INFINITY
s_stack INFINITY
h_stack INFINITY
s_core INFINITY
h_core INFINITY
s_rss INFINITY
h_rss INFINITY
s_vmem INFINITY
h_vmem INFINITY
这是一个正在运行的作业的状态示例。
[user@machine]$ qstat -j 255
==============================================================
job_number: 255
exec_file: job_scripts/255
submission_time: Mon Jul 3 11:19:07 2017
owner: <user>
uid: 500
group: <user>
gid: 502
sge_o_home: /home/<user>
sge_o_log_name: <user>
sge_o_path: /home/<user>/<programDIR>/bin/linux:/home/<user>/<gitRootDIR>/<codeDIR>/bin/linux:/opt/openmpi/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/usr/java/latest/bin:/opt/maven/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/gridengine/bin/linux-x64:/home/<user>/scripts/:/home/<user>/bin
sge_o_shell: /bin/bash
sge_o_workdir: <CORRECT working DIR>
sge_o_host: <HOSTNAME>
account: sge
merge: y
mail_options: ae
mail_list: <user@domain.tld>
notify: FALSE
job_name: STDIN
stdout_path_list: NONE:NONE:test3_w5.eo
jobshare: 0
env_list: HOSTNAME=<HOSTNAME>.FQDN,SHELL=/bin/bash,TERM=xterm,HISTSIZE=1000,EGS_HOME=/home/<user>/<programDIR>/,SSH_CLIENT=172.24.56.106 56512 22,SGE_ARCH=linux-x64,SGE_CELL=default,MPICH_PROCESS_GROUP=no,QTDIR=/usr/lib64/qt-3.3,QTINC=/usr/lib64/qt-3.3/include,SSH_TTY=/dev/pts/0,ROCKSROOT=/opt/rocks/share/devel,ANT_HOME=/opt/rocks,USER=<user>,LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj=01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01;31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:,LD_LIBRARY_PATH=/opt/gridengine/lib/linux-x64:/opt/openmpi/lib,ROCKS_ROOT=/opt/rocks,<DEFAULT_BATCH_SYSTEM>=sge,PATH=/home/<user>/<programDIR>/bin/linux:/home/<user>/<gitRootDIR>/<codeDIR>/bin/linux:/opt/openmpi/bin:/usr/lib64/qt-3.3/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/usr/java/latest/bin:/opt/maven/bin:/opt/rocks/bin:/opt/rocks/sbin:/opt/gride
ngine/bin/linux-x64:/home/<user>/scripts/:/home/<user>/bin,MAIL=/var/spool/mail/<user>,MAVEN_HOME=/opt/maven,<codeDIR>=/home/<user>/<gitRootDIR>/<codeDIR>/,PWD=/home/<user>/<programDIR>/dosxyznrc,JAVA_HOME=/usr/java/latest,_LMFILES_=/usr/share/Modules/modulefiles/rocks-openmpi,SGE_EXECD_PORT=537,LANG=en_US.iso885915,MODULEPATH=/usr/share/Modules/modulefiles:/etc/modulefiles,SGE_QMASTER_PORT=536,LOADEDMODULES=rocks-openmpi,SGE_ROOT=/opt/gridengine,HISTCONTROL=ignoredups,SSH_ASKPASS=/usr/libexec/openssh/gnome-ssh-askpass,HOME=/home/<user>,SHLVL=2,ROLLSROOT=/opt/rocks/share/devel/src/roll,MPIHOME=/opt/openmpi,LOGNAME=<user>,CVS_RSH=ssh,QTLIB=/usr/lib64/qt-3.3/lib,SSH_CONNECTION=172.24.56.106 56512 172.24.59.111 22,MODULESHOME=/usr/share/Modules,LESSOPEN=||/usr/bin/lesspipe.sh %s,OMEGA_HOME=/home/<user>/<gitRootDIR>/<codeDIR>/omega,EGS_CONFIG=/home/<user>/<gitRootDIR>/<codeDIR>/specs/linux.conf,G_BROKEN_FILENAMES=1,OMPI_MCA_btl=self,sm,tcp,BASH_FUNC_module()=() { eval `/usr/bin/modulecmd bash $*`
},_=/opt/gridengine/bin/linux-x64/qsub
script_file: STDIN
usage 1: cpu=00:13:50, mem=833.52093 GBs, io=0.05473, vmem=1.109G, maxvmem=1.109G
scheduling info: (Collecting of scheduler job information is turned off)