From 2340ced9b84369ae46a354b28c91c8459b0e540c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gardais=20J=C3=A9r=C3=A9my?= Date: Tue, 23 Nov 2021 17:14:48 +0100 Subject: [PATCH] Ensure queue is disabled before starting sge_execd --- cluster/sge.enable.host.queue.sh | 60 ++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/cluster/sge.enable.host.queue.sh b/cluster/sge.enable.host.queue.sh index 0f8500b..5198bc7 100755 --- a/cluster/sge.enable.host.queue.sh +++ b/cluster/sge.enable.host.queue.sh @@ -114,7 +114,7 @@ define_vars() { # {{{ fi ## }}} - ## If the host to manage is the current one + ## If the host to manage is the current one {{{ if is_current_host "${sge_hostname}" ; then debug_message "define_vars − \ ${sge_hostname} is the current host." @@ -130,8 +130,9 @@ ${sge_hostname} is not the current host." ## Force to (re)enable SGE queue(s) in any case FORCE_MODE="0" fi + ## }}} - ## If FORCE_MODE was defined and enabled + ## If FORCE_MODE was defined and enabled {{{ if [ -n "${FORCE_MODE}" ] && [ "${FORCE_MODE}" -eq "0" ]; then ## Disable upgrade checking (remote host, asked behaviour,…) CHECK_UPGRADE="1" @@ -143,8 +144,9 @@ ${sge_hostname} is not the current host." ## Ensure to define a value FORCE_MODE="1" fi + ## }}} - ## Get all queues name + ## Get all queues name {{{ sge_queues_name="$(qhost -h "${sge_hostname:=/dev/null}" -q -xml \ | grep "queue name" \ | cut -d"'" -f2 )" @@ -152,16 +154,21 @@ ${sge_hostname} is not the current host." 
| grep "queue name" \ | cut -d"'" -f2 \ | tr -s '\n' ' ' )" + ## }}} - ## List of process pattern to monitor + ## List of process pattern to monitor {{{ maco_proc_pattern="(/opt/maco/bin/maco.autoupdate.sh)" apt_proc_pattern="(aptitude.*full-upgrade|/usr/bin/dpkg.*--configure|dpkg-deb|/bin/sh /usr/lib/needrestart/dpkg-status)" sge_proc_pattern="(/usr/lib/gridengine/sge_execd)" - - ## List of files to monitor + ## }}} + ## List of files to monitor {{{ file_nologin_path="/etc/nologin" cluster_dir="/opt/ipr/cluster" sge_queue_flag_pattern="${cluster_dir}/.sge.*.disable" + ## }}} + + ## Script used to disable SGE queue(s) + sge_disable_host_queue_script="${PROGDIR}/sge.disable.host.queue.sh" } # }}} is_sge_host() { # {{{ @@ -520,59 +527,73 @@ main() { # {{{ ## If we need to watch for processes if [ "${CHECK_PROCESS}" -eq "0" ]; then - ## If nothing related to SGE is currently running + ## Ensure the SGE queue(s) are really disabled, without creating any flag file + sh "${sge_disable_host_queue_script}" --force + + ## Wait a few seconds + sleep "${sleep_delay}" + + ## If nothing related to SGE is currently running {{{ ### Try to start the SGE execd systemd service ### Exit with error if the service can't start is_proc_running "${sge_proc_pattern}" \ || systemctl --quiet start sge_execd.service > /dev/null 2>&1 \ || exit 4 + ## }}} - ## Wait some seconds + ## Wait a few seconds sleep "${sleep_delay}" fi ## If we need to watch for upgrades if [ "${CHECK_UPGRADE}" -eq "0" ]; then - ## If APT package upgrade is available + ## If APT package upgrade is available {{{ ### Exit (wait for APT upgrade to be applied) is_apt_upgrade_present \ && exit 0 + ## }}} - ## If Maco upgrade is present + ## If Maco upgrade is present {{{ ### Exit (wait for Maco upgrade to be applied) is_maco_upgrade_present \ && exit 0 + ## }}} fi - ## If Maco status is ok, CONTINUE + ## If Maco status is ok, CONTINUE {{{ ### Else Exit (wait for upgrade/maintenance) is_maco_status_ok \ || exit 0 + ## }}} ## If 
we need to watch for processes if [ "${CHECK_PROCESS}" -eq "0" ]; then - ## If anything related to APT is currently running + ## If anything related to APT is currently running {{{ ### Exit is_proc_running "${apt_proc_pattern}" \ && exit 0 + ## }}} - ## If anything related to maco is currently running + ## If anything related to maco is currently running {{{ ### Exit is_proc_running "${maco_proc_pattern}" \ && exit 0 + ## }}} fi ## If we need to watch files if [ "${CHECK_FILE}" -eq "0" ]; then - ## If nologin file exist (error on upgrade,…) + ## If nologin file exist (error on upgrade,…) {{{ ### Exit is_file_present "${file_nologin_path}" \ && exit 0 + ## }}} - ## If all SGE queue(s) were manually disabled (not any flag file) + ## If all SGE queue(s) were manually disabled (not any flag file) {{{ ### Exit is_file_absent "${sge_queue_flag_pattern}" \ && exit 0 + ## }}} fi ## Simple debug message with color to valid current variables @@ -580,13 +601,14 @@ main() { # {{{ SGE queue(s): ${RED}${sge_queues_name_print:=/dev/null}${COLOR_DEBUG}\ for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}." - ## If the queue(s) are already enable + ## If the queue(s) are already enable {{{ ### Ensure to remove any potential flag file # shellcheck disable=SC2086 ### Exit is_all_queue_enable "${sge_hostname}" "${sge_queues_name}" \ && find ${sge_queue_flag_pattern} -delete \ && exit 0 + ## }}} ## Test all queues one by one for loop_queue in ${sge_queues_name}; do @@ -595,12 +617,13 @@ for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}." 
## "automatically disabled" for an upgrade sge_queue_flag_file="${cluster_dir}/.sge.${loop_queue}.disable" - ## If the queue is disable + ## If the queue is disable {{{ ### Try to enable it is_queue_disable "${sge_hostname}" "${loop_queue}" \ && enable_sge_queue "${sge_hostname}" "${loop_queue}" + ## }}} - ## Don't consider manually disabled queue as an error except if FORCE_MODE was specified + ## Don't consider manually disabled queue as an error except if FORCE_MODE was specified {{{ if [ -f "${sge_queue_flag_file}" ] || [ "${FORCE_MODE}" -eq "0" ]; then ## If the queue is still disable ### Exit with error @@ -608,6 +631,7 @@ for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}." && printf '%b\n' "${RED}ERROR ${loop_queue}@${sge_hostname} is still disable.${RESET}" \ && exit 5 fi + ## }}} done