Ensure queue is disabled before starting sge_execd

This commit is contained in:
Jeremy Gardais 2021-11-23 17:14:48 +01:00
parent 9d31d4ab02
commit 2340ced9b8
Signed by: jegardai
GPG Key ID: E759BAA22501AF32
1 changed file with 42 additions and 18 deletions

View File

@ -114,7 +114,7 @@ define_vars() { # {{{
fi
## }}}
## If the host to manage is the current one
## If the host to manage is the current one {{{
if is_current_host "${sge_hostname}" ; then
debug_message "define_vars \
${sge_hostname} is the current host."
@ -130,8 +130,9 @@ ${sge_hostname} is not the current host."
## Force to (re)enable SGE queue(s) in any case
FORCE_MODE="0"
fi
## }}}
## If FORCE_MODE was defined and enabled
## If FORCE_MODE was defined and enabled {{{
if [ -n "${FORCE_MODE}" ] && [ "${FORCE_MODE}" -eq "0" ]; then
## Disable upgrade checking (remote host, asked behaviour,…)
CHECK_UPGRADE="1"
@ -143,8 +144,9 @@ ${sge_hostname} is not the current host."
## Ensure to define a value
FORCE_MODE="1"
fi
## }}}
## Get all queues name
## Get all queues name {{{
sge_queues_name="$(qhost -h "${sge_hostname:=/dev/null}" -q -xml \
| grep "queue name" \
| cut -d"'" -f2 )"
@ -152,16 +154,21 @@ ${sge_hostname} is not the current host."
| grep "queue name" \
| cut -d"'" -f2 \
| tr -s '\n' ' ' )"
## }}}
## List of process pattern to monitor
## List of process pattern to monitor {{{
maco_proc_pattern="(/opt/maco/bin/maco.autoupdate.sh)"
apt_proc_pattern="(aptitude.*full-upgrade|/usr/bin/dpkg.*--configure|dpkg-deb|/bin/sh /usr/lib/needrestart/dpkg-status)"
sge_proc_pattern="(/usr/lib/gridengine/sge_execd)"
## List of files to monitor
## }}}
## List of files to monitor {{{
file_nologin_path="/etc/nologin"
cluster_dir="/opt/ipr/cluster"
sge_queue_flag_pattern="${cluster_dir}/.sge.*.disable"
## }}}
## Script used to disable SGE queue(s)
sge_disable_host_queue_script="${PROGDIR}/sge.disable.host.queue.sh"
}
# }}}
is_sge_host() { # {{{
@ -520,59 +527,73 @@ main() { # {{{
## If we need to watch for processes
if [ "${CHECK_PROCESS}" -eq "0" ]; then
## If nothing related to SGE is currently running
## Ensure the SGE queue(s) are really disable without creation of any flag file
sh "${sge_disable_host_queue_script}" --force
## Wait few seconds
sleep "${sleep_delay}"
## If nothing related to SGE is currently running {{{
### Try to start the SGE execd systemd service
### Exit with error if the service can't start
is_proc_running "${sge_proc_pattern}" \
|| systemctl --quiet start sge_execd.service > /dev/null 2>&1 \
|| exit 4
## }}}
## Wait some seconds
## Wait few seconds
sleep "${sleep_delay}"
fi
## If we need to watch for upgrades
if [ "${CHECK_UPGRADE}" -eq "0" ]; then
## If APT package upgrade is available
## If APT package upgrade is available {{{
### Exit (wait for APT upgrade to be applied)
is_apt_upgrade_present \
&& exit 0
## }}}
## If Maco upgrade is present
## If Maco upgrade is present {{{
### Exit (wait for Maco upgrade to be applied)
is_maco_upgrade_present \
&& exit 0
## }}}
fi
## If Maco status is ok, CONTINUE
## If Maco status is ok, CONTINUE {{{
### Else Exit (wait for upgrade/maintenance)
is_maco_status_ok \
|| exit 0
## }}}
## If we need to watch for processes
if [ "${CHECK_PROCESS}" -eq "0" ]; then
## If anything related to APT is currently running
## If anything related to APT is currently running {{{
### Exit
is_proc_running "${apt_proc_pattern}" \
&& exit 0
## }}}
## If anything related to maco is currently running
## If anything related to maco is currently running {{{
### Exit
is_proc_running "${maco_proc_pattern}" \
&& exit 0
## }}}
fi
## If we need to watch files
if [ "${CHECK_FILE}" -eq "0" ]; then
## If nologin file exist (error on upgrade,…)
## If nologin file exist (error on upgrade,…) {{{
### Exit
is_file_present "${file_nologin_path}" \
&& exit 0
## }}}
## If all SGE queue(s) were manually disabled (not any flag file)
## If all SGE queue(s) were manually disabled (not any flag file) {{{
### Exit
is_file_absent "${sge_queue_flag_pattern}" \
&& exit 0
## }}}
fi
## Simple debug message with color to valid current variables
@ -580,13 +601,14 @@ main() { # {{{
SGE queue(s): ${RED}${sge_queues_name_print:=/dev/null}${COLOR_DEBUG}\
for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}."
## If the queue(s) are already enable
## If the queue(s) are already enable {{{
### Ensure to remove any potential flag file
# shellcheck disable=SC2086
### Exit
is_all_queue_enable "${sge_hostname}" "${sge_queues_name}" \
&& find ${sge_queue_flag_pattern} -delete \
&& exit 0
## }}}
## Test all queues one by one
for loop_queue in ${sge_queues_name}; do
@ -595,12 +617,13 @@ for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}."
## "automatically disabled" for an upgrade
sge_queue_flag_file="${cluster_dir}/.sge.${loop_queue}.disable"
## If the queue is disable
## If the queue is disable {{{
### Try to enable it
is_queue_disable "${sge_hostname}" "${loop_queue}" \
&& enable_sge_queue "${sge_hostname}" "${loop_queue}"
## }}}
## Don't consider manually disabled queue as an error except if FORCE_MODE was specified
## Don't consider manually disabled queue as an error except if FORCE_MODE was specified {{{
if [ -f "${sge_queue_flag_file}" ] || [ "${FORCE_MODE}" -eq "0" ]; then
## If the queue is still disable
### Exit with error
@ -608,6 +631,7 @@ for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}."
&& printf '%b\n' "${RED}ERROR ${loop_queue}@${sge_hostname} is still disable.${RESET}" \
&& exit 5
fi
## }}}
done