Ensure queue is disabled before starting sge_execd

This commit is contained in:
Jeremy Gardais 2021-11-23 17:14:48 +01:00
parent 9d31d4ab02
commit 2340ced9b8
Signed by: jegardai
GPG Key ID: E759BAA22501AF32
1 changed file with 42 additions and 18 deletions

View File

@ -114,7 +114,7 @@ define_vars() { # {{{
fi fi
## }}} ## }}}
## If the host to manage is the current one ## If the host to manage is the current one {{{
if is_current_host "${sge_hostname}" ; then if is_current_host "${sge_hostname}" ; then
debug_message "define_vars \ debug_message "define_vars \
${sge_hostname} is the current host." ${sge_hostname} is the current host."
@ -130,8 +130,9 @@ ${sge_hostname} is not the current host."
## Force to (re)enable SGE queue(s) in any case ## Force to (re)enable SGE queue(s) in any case
FORCE_MODE="0" FORCE_MODE="0"
fi fi
## }}}
## If FORCE_MODE was defined and enabled ## If FORCE_MODE was defined and enabled {{{
if [ -n "${FORCE_MODE}" ] && [ "${FORCE_MODE}" -eq "0" ]; then if [ -n "${FORCE_MODE}" ] && [ "${FORCE_MODE}" -eq "0" ]; then
## Disable upgrade checking (remote host, asked behaviour,…) ## Disable upgrade checking (remote host, asked behaviour,…)
CHECK_UPGRADE="1" CHECK_UPGRADE="1"
@ -143,8 +144,9 @@ ${sge_hostname} is not the current host."
## Ensure to define a value ## Ensure to define a value
FORCE_MODE="1" FORCE_MODE="1"
fi fi
## }}}
## Get all queues name ## Get all queues name {{{
sge_queues_name="$(qhost -h "${sge_hostname:=/dev/null}" -q -xml \ sge_queues_name="$(qhost -h "${sge_hostname:=/dev/null}" -q -xml \
| grep "queue name" \ | grep "queue name" \
| cut -d"'" -f2 )" | cut -d"'" -f2 )"
@ -152,16 +154,21 @@ ${sge_hostname} is not the current host."
| grep "queue name" \ | grep "queue name" \
| cut -d"'" -f2 \ | cut -d"'" -f2 \
| tr -s '\n' ' ' )" | tr -s '\n' ' ' )"
## }}}
## List of process pattern to monitor ## List of process pattern to monitor {{{
maco_proc_pattern="(/opt/maco/bin/maco.autoupdate.sh)" maco_proc_pattern="(/opt/maco/bin/maco.autoupdate.sh)"
apt_proc_pattern="(aptitude.*full-upgrade|/usr/bin/dpkg.*--configure|dpkg-deb|/bin/sh /usr/lib/needrestart/dpkg-status)" apt_proc_pattern="(aptitude.*full-upgrade|/usr/bin/dpkg.*--configure|dpkg-deb|/bin/sh /usr/lib/needrestart/dpkg-status)"
sge_proc_pattern="(/usr/lib/gridengine/sge_execd)" sge_proc_pattern="(/usr/lib/gridengine/sge_execd)"
## }}}
## List of files to monitor ## List of files to monitor {{{
file_nologin_path="/etc/nologin" file_nologin_path="/etc/nologin"
cluster_dir="/opt/ipr/cluster" cluster_dir="/opt/ipr/cluster"
sge_queue_flag_pattern="${cluster_dir}/.sge.*.disable" sge_queue_flag_pattern="${cluster_dir}/.sge.*.disable"
## }}}
## Script used to disable SGE queue(s)
sge_disable_host_queue_script="${PROGDIR}/sge.disable.host.queue.sh"
} }
# }}} # }}}
is_sge_host() { # {{{ is_sge_host() { # {{{
@ -520,59 +527,73 @@ main() { # {{{
## If we need to watch for processes ## If we need to watch for processes
if [ "${CHECK_PROCESS}" -eq "0" ]; then if [ "${CHECK_PROCESS}" -eq "0" ]; then
## If nothing related to SGE is currently running ## Ensure the SGE queue(s) are really disable without creation of any flag file
sh "${sge_disable_host_queue_script}" --force
## Wait few seconds
sleep "${sleep_delay}"
## If nothing related to SGE is currently running {{{
### Try to start the SGE execd systemd service ### Try to start the SGE execd systemd service
### Exit with error if the service can't start ### Exit with error if the service can't start
is_proc_running "${sge_proc_pattern}" \ is_proc_running "${sge_proc_pattern}" \
|| systemctl --quiet start sge_execd.service > /dev/null 2>&1 \ || systemctl --quiet start sge_execd.service > /dev/null 2>&1 \
|| exit 4 || exit 4
## }}}
## Wait some seconds ## Wait few seconds
sleep "${sleep_delay}" sleep "${sleep_delay}"
fi fi
## If we need to watch for upgrades ## If we need to watch for upgrades
if [ "${CHECK_UPGRADE}" -eq "0" ]; then if [ "${CHECK_UPGRADE}" -eq "0" ]; then
## If APT package upgrade is available ## If APT package upgrade is available {{{
### Exit (wait for APT upgrade to be applied) ### Exit (wait for APT upgrade to be applied)
is_apt_upgrade_present \ is_apt_upgrade_present \
&& exit 0 && exit 0
## }}}
## If Maco upgrade is present ## If Maco upgrade is present {{{
### Exit (wait for Maco upgrade to be applied) ### Exit (wait for Maco upgrade to be applied)
is_maco_upgrade_present \ is_maco_upgrade_present \
&& exit 0 && exit 0
## }}}
fi fi
## If Maco status is ok, CONTINUE ## If Maco status is ok, CONTINUE {{{
### Else Exit (wait for upgrade/maintenance) ### Else Exit (wait for upgrade/maintenance)
is_maco_status_ok \ is_maco_status_ok \
|| exit 0 || exit 0
## }}}
## If we need to watch for processes ## If we need to watch for processes
if [ "${CHECK_PROCESS}" -eq "0" ]; then if [ "${CHECK_PROCESS}" -eq "0" ]; then
## If anything related to APT is currently running ## If anything related to APT is currently running {{{
### Exit ### Exit
is_proc_running "${apt_proc_pattern}" \ is_proc_running "${apt_proc_pattern}" \
&& exit 0 && exit 0
## }}}
## If anything related to maco is currently running ## If anything related to maco is currently running {{{
### Exit ### Exit
is_proc_running "${maco_proc_pattern}" \ is_proc_running "${maco_proc_pattern}" \
&& exit 0 && exit 0
## }}}
fi fi
## If we need to watch files ## If we need to watch files
if [ "${CHECK_FILE}" -eq "0" ]; then if [ "${CHECK_FILE}" -eq "0" ]; then
## If nologin file exist (error on upgrade,…) ## If nologin file exist (error on upgrade,…) {{{
### Exit ### Exit
is_file_present "${file_nologin_path}" \ is_file_present "${file_nologin_path}" \
&& exit 0 && exit 0
## }}}
## If all SGE queue(s) were manually disabled (not any flag file) ## If all SGE queue(s) were manually disabled (not any flag file) {{{
### Exit ### Exit
is_file_absent "${sge_queue_flag_pattern}" \ is_file_absent "${sge_queue_flag_pattern}" \
&& exit 0 && exit 0
## }}}
fi fi
## Simple debug message with color to valid current variables ## Simple debug message with color to valid current variables
@ -580,13 +601,14 @@ main() { # {{{
SGE queue(s): ${RED}${sge_queues_name_print:=/dev/null}${COLOR_DEBUG}\ SGE queue(s): ${RED}${sge_queues_name_print:=/dev/null}${COLOR_DEBUG}\
for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}." for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}."
## If the queue(s) are already enable ## If the queue(s) are already enable {{{
### Ensure to remove any potential flag file ### Ensure to remove any potential flag file
# shellcheck disable=SC2086 # shellcheck disable=SC2086
### Exit ### Exit
is_all_queue_enable "${sge_hostname}" "${sge_queues_name}" \ is_all_queue_enable "${sge_hostname}" "${sge_queues_name}" \
&& find ${sge_queue_flag_pattern} -delete \ && find ${sge_queue_flag_pattern} -delete \
&& exit 0 && exit 0
## }}}
## Test all queues one by one ## Test all queues one by one
for loop_queue in ${sge_queues_name}; do for loop_queue in ${sge_queues_name}; do
@ -595,12 +617,13 @@ for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}."
## "automatically disabled" for an upgrade ## "automatically disabled" for an upgrade
sge_queue_flag_file="${cluster_dir}/.sge.${loop_queue}.disable" sge_queue_flag_file="${cluster_dir}/.sge.${loop_queue}.disable"
## If the queue is disable ## If the queue is disable {{{
### Try to enable it ### Try to enable it
is_queue_disable "${sge_hostname}" "${loop_queue}" \ is_queue_disable "${sge_hostname}" "${loop_queue}" \
&& enable_sge_queue "${sge_hostname}" "${loop_queue}" && enable_sge_queue "${sge_hostname}" "${loop_queue}"
## }}}
## Don't consider manually disabled queue as an error except if FORCE_MODE was specified ## Don't consider manually disabled queue as an error except if FORCE_MODE was specified {{{
if [ -f "${sge_queue_flag_file}" ] || [ "${FORCE_MODE}" -eq "0" ]; then if [ -f "${sge_queue_flag_file}" ] || [ "${FORCE_MODE}" -eq "0" ]; then
## If the queue is still disable ## If the queue is still disable
### Exit with error ### Exit with error
@ -608,6 +631,7 @@ for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}."
&& printf '%b\n' "${RED}ERROR ${loop_queue}@${sge_hostname} is still disable.${RESET}" \ && printf '%b\n' "${RED}ERROR ${loop_queue}@${sge_hostname} is still disable.${RESET}" \
&& exit 5 && exit 5
fi fi
## }}}
done done