这是我的完整解决方案,它增加了执行时间控制,并返回失败作业的数量。必要时,它还会杀死失败作业的子进程,并处理僵尸进程或处于不可中断睡眠(uninterruptible sleep)状态的进程:
# Print a log message on stdout, followed by a newline.
# Arguments:
#   $1 - message to print
#   $2 - optional log level passed by callers (e.g. "NOTICE", "ERROR", "DEBUG"); currently ignored
function Logger {
    # printf instead of echo: echo would treat messages such as "-n" or "-e" as options.
    printf '%s\n' "${1-}"
}
# Portable child (and grandchild) kill function tested under Linux, BSD and MacOS X
#
# Recursively kills every descendant of a pid (deepest first), then optionally
# the pid itself.
# Arguments:
#   $1 - parent pid whose children must be killed
#   $2 - "true" if the parent itself must be killed too (default: false)
# Returns:
#   0 when nothing had to be sent to $1 or a signal could be sent to it,
#   1 when neither SIGTERM nor SIGKILL could be sent to $1.
#   The status of the recursive calls on children is not propagated.
function KillChilds {
    local pid="${1}" # Parent pid to kill childs
    local self="${2:-false}" # Should parent be killed too ?
    local children # Direct children of $pid, as printed by pgrep
    local child # Current child pid

    if children="$(pgrep -P "$pid")"; then
        # Intentionally unquoted: pgrep prints one pid per line, split them into words
        for child in $children; do
            KillChilds "$child" true
        done
    fi

    # Try to kill nicely, if not, wait 15 seconds to let Trap actions happen before killing
    if [ "$self" == true ] && kill -0 "$pid" > /dev/null 2>&1; then
        if ! kill -s TERM "$pid"; then
            sleep 15
            Logger "Sending SIGTERM to process [$pid] failed."
            if ! kill -9 "$pid"; then
                Logger "Sending SIGKILL to process [$pid] failed."
                return 1
            fi
        fi
    fi
    return 0
}
# Wait for a list of background pids to finish, with optional soft and hard
# time limits, and report how many of them failed.
#
# Relies on helpers defined outside this function: Logger, Spinner, joinString,
# SendAlert, KillChilds and IsNumeric (expected to print 1 for a numeric value).
#
# Globals:
#   WAIT_FOR_TASK_COMPLETION (written) - "pid:exitcode" entries, separated by
#                                        semi-colons, for every pid that exited non-zero
#   SECONDS (read)
# Arguments:
#   $1 - pids to wait for, separated by semi-colons
#   $2 - soft limit in seconds: an alert is sent once when exceeded (0 or empty disables it)
#   $3 - hard limit in seconds: remaining pids are killed when exceeded (0 or empty disables it)
#   $4 - name of the caller, used in log messages
#   $5 - "true" to count time since this function started (default),
#        anything else to count since the script started
#   $6 - log a standby message every X seconds (default 0 = disabled)
# Returns:
#   the exit code of the monitored process when exactly one pid was given and it
#   exited non-zero; otherwise the number of errors (failed pids and hard time-outs),
#   capped at 255.
function WaitForTaskCompletion {
    local pids="${1}" # pids to wait for, separated by semi-colon
    local soft_max_time="${2:-0}" # If program with pid $pid takes longer than $soft_max_time seconds, will log a warning, unless $soft_max_time equals 0.
    local hard_max_time="${3:-0}" # If program with pid $pid takes longer than $hard_max_time seconds, will stop execution, unless $hard_max_time equals 0.
    local caller_name="${4}" # Who called this function
    local counting="${5:-true}" # Count time since function has been launched if true, since script has been launched if false
    local keep_logging="${6:-0}" # Log a standby message every X seconds. Set to zero to disable logging

    local soft_alert=false # Has the soft alert already been sent ? It must only be sent once
    local log_ttime=0 # Last second at which a standby message was logged
    local seconds_begin=$SECONDS # Seconds since the beginning of the script
    local exec_time=0 # Elapsed seconds, see $counting
    local retval=0 # Return value of the last finished pid
    local errorcount=0 # Number of pids that finished with errors
    local pid # Current pid working on
    local pidCount # Number of given pids
    local pidState # State of the process
    local pidsArray=() # Array of currently running pids
    local newPidsArray=() # New array of currently running pids

    IFS=';' read -r -a pidsArray <<< "$pids"
    pidCount=${#pidsArray[@]}

    WAIT_FOR_TASK_COMPLETION=""

    while [ ${#pidsArray[@]} -gt 0 ]; do
        newPidsArray=()

        Spinner
        if [ "$counting" == true ]; then
            exec_time=$((SECONDS - seconds_begin))
        else
            exec_time=$SECONDS
        fi

        if [ "$keep_logging" -ne 0 ]; then
            if [ $(( (exec_time + 1) % keep_logging )) -eq 0 ]; then
                # The loop runs several times per second: only log once per matching second
                if [ "$log_ttime" -ne "$exec_time" ]; then
                    log_ttime=$exec_time
                    Logger "Current tasks still running with pids [$(joinString , "${pidsArray[@]}")]." "NOTICE"
                fi
            fi
        fi

        # Soft limit: warn and alert a single time
        if [ "$soft_max_time" -ne 0 ] && [ "$exec_time" -gt "$soft_max_time" ] && [ "$soft_alert" != true ]; then
            Logger "Max soft execution time exceeded for task [$caller_name] with pids [$(joinString , "${pidsArray[@]}")]." "WARN"
            soft_alert=true
            SendAlert true
        fi

        # Hard limit: checked independently of the soft limit, kills every remaining pid
        if [ "$hard_max_time" -ne 0 ] && [ "$exec_time" -gt "$hard_max_time" ]; then
            Logger "Max hard execution time exceeded for task [$caller_name] with pids [$(joinString , "${pidsArray[@]}")]. Stopping task execution." "ERROR"
            for pid in "${pidsArray[@]}"; do
                if KillChilds "$pid" true; then
                    Logger "Task with pid [$pid] stopped successfully." "NOTICE"
                else
                    Logger "Could not stop task with pid [$pid]." "ERROR"
                fi
            done
            SendAlert true
            errorcount=$((errorcount+1))
        fi

        for pid in "${pidsArray[@]}"; do
            if [ "$(IsNumeric "$pid")" -eq 1 ]; then
                if kill -0 "$pid" > /dev/null 2>&1; then
                    # Handle uninterruptible sleep state or zombies by omitting them from running process array (How to kill that is already dead ? :)
                    #TODO(high): have this tested on *BSD, Mac & Win
                    # NOTE(review): a pid sampled in D or Z state is dropped without its exit code being collected - confirm this is acceptable for I/O heavy tasks.
                    pidState="$(ps -p "$pid" -o state= 2> /dev/null)"
                    # Keep only the first non-blank character, in case ps pads the value or appends flags
                    pidState="${pidState//[[:space:]]/}"
                    pidState="${pidState:0:1}"
                    if [ "$pidState" != "D" ] && [ "$pidState" != "Z" ]; then
                        newPidsArray+=("$pid")
                    fi
                else
                    # pid is dead, get its exit code from wait command
                    wait "$pid"
                    retval=$?
                    if [ "$retval" -ne 0 ]; then
                        errorcount=$((errorcount+1))
                        Logger "${FUNCNAME[0]} called by [$caller_name] finished monitoring [$pid] with exitcode [$retval]." "DEBUG"
                        if [ -z "$WAIT_FOR_TASK_COMPLETION" ]; then
                            WAIT_FOR_TASK_COMPLETION="$pid:$retval"
                        else
                            WAIT_FOR_TASK_COMPLETION="$WAIT_FOR_TASK_COMPLETION;$pid:$retval"
                        fi
                    fi
                fi
            fi
        done

        pidsArray=("${newPidsArray[@]}")
        # Trivial wait time for bash to not eat up all CPU
        sleep .05
    done

    # Return exit code if only one process was monitored, else return number of errors
    if [ "$pidCount" -eq 1 ] && [ "$retval" -ne 0 ]; then
        return "$retval"
    fi
    # Exit statuses wrap at 256: cap so that many errors can never look like success
    if [ "$errorcount" -gt 255 ]; then
        return 255
    fi
    return "$errorcount"
}
用法:
让我们启动 3 个 sleep 作业,获取它们的 pid,并将这些 pid 传给 WaitForTaskCompletion:
# Launch three background sleep jobs and collect their pids, separated by semi-colons
sleep 10 &
pids="$!"
sleep 15 &
pids="$pids;$!"
sleep 20 &
pids="$pids;$!"
# Soft limit 1800s, hard limit 3600s, time counted since script start, standby log every 1800s.
# The caller name is quoted with a fallback: FUNCNAME is unset outside of a function,
# and an empty unquoted expansion would shift the remaining arguments.
WaitForTaskCompletion "$pids" 1800 3600 "${FUNCNAME[0]:-main}" false 1800
前面的示例会在执行时间超过 30 分钟(1800 秒)时向您发出警告,在 1 小时(3600 秒)后停止执行,并每半小时记录一条“仍在运行”的日志消息。