#!/bin/sh
#
# Try to (re)enable the SGE queue(s) of a host.
#
# For the current host, the queues are only enabled when no pending
# upgrade (APT, Maco) is flagged, no upgrade process is running, no
# nologin file exists and at least one queue flag file (set when a queue
# was disabled by a script) is present. For any other host, or with
# --force, all these checks are skipped.
#
# Environment variables honoured when already defined:
#   DEBUG            0 to print debug messages (default: 1)
#   OUTPUT_MESSAGE   0 to print messages on stdout (default: 0)
#   FORCE_MODE       0 to force the enabling of the queue(s)
#   sge_hostname     Host whose queue(s) should be managed
#   sge_master_uri   SGE master host name
#   sge_master_port  SGE master port
#
# Exit codes:
#   0  Nothing to do, or queue(s) successfully managed
#   1  Invalid option, or option with a missing value
#   2  State of a queue can't be determined (is_queue_enable)
#   3  State of a queue can't be determined (is_queue_disable)
#   4  SGE execd service can't be started
#   5  A queue is still disabled after trying to enable it

# Vars {{{
readonly PROGNAME="$(basename "${0}")"
readonly PROGDIR="$(readlink -m "$(dirname "${0}")")"
readonly ARGS="${*}"
readonly NBARGS="${#}"
## Test if DEBUG is already defined (by parent script,…)
[ -z "${DEBUG}" ] && DEBUG=1

# If output message should be displayed
[ -z "${OUTPUT_MESSAGE}" ] && OUTPUT_MESSAGE=0

# APT temp file to monitor
readonly APT_TMP_FILE="/tmp/.apt.upgrade"
# Maco temp file
readonly MACO_LOCAL_DIR="/opt/maco"
readonly MACO_TMP_FILE="${MACO_LOCAL_DIR}/.maco.upgrade"
readonly MACO_TMP_URGENT_FILE="${MACO_LOCAL_DIR}/.maco.urgent.upgrade"
# Maco status file
readonly MACO_STATUS_FILE="/var/fr.univ-rennes1.ipr.maco.machinestate.txt"

## Colors
readonly PURPLE='\033[1;35m'
readonly RED='\033[0;31m'
readonly RESET='\033[0m'
readonly COLOR_DEBUG="${PURPLE}"
# }}}

# Print the help message on standard output.
usage() { # {{{

    cat << EOF
usage: $PROGNAME [--help] [-d|-f|-h|-q] [hostname]

Try to enable all SGE queues of the current host (default),
if no pending upgrades (Maco, APT) are present,
or to the one passed as first argument (no upgrades checking).

EXAMPLES :
    - Enable SGE's queue(s) of the current host
        ${PROGNAME}

    - Enable SGE's queue(s) of "marvin.domain.tld" host
        ${PROGNAME} marvin.domain.tld
        ${PROGNAME} -h marvin.domain.tld

OPTIONS :
    -d,--debug
        Enable debug messages.

    -f,--force
        Try to (re)enable a queue even if it was previously
        manually disabled (by a user) and avoid all checks
        (pending upgrades, running processes, files,…).

    --help
        Print this help message.

    -h,--host,--hostname SGE_HOST_TO_MANAGE
        Manage SGE's queue(s) of "SGE_HOST_TO_MANAGE" host.

    -q,--quiet
        Disable messages on standard output (except for error).
EOF

}
# }}}
# Print a colored debug message when DEBUG is enabled (=0).
#   $1: message to print (backslash escapes are interpreted)
# Always returns 0.
debug_message() { # {{{

    local_debug_message="${1}"

    ## Print message if DEBUG is enable (=0)
    ## \033 is used instead of \e, which printf is not required to support
    [ "${DEBUG}" -eq "0" ] && printf '\033[1;35m%-6b\033[m\n' "DEBUG − ${PROGNAME} : ${local_debug_message}"

    return 0
}
# }}}
# Print a message on standard output when OUTPUT_MESSAGE is enabled (=0).
#   $1: message to print (backslash escapes are interpreted)
# Always returns 0.
message() { # {{{

    local_message="${1}"

    ## Print message if OUTPUT_MESSAGE is enable (=0)
    [ "${OUTPUT_MESSAGE}" -eq "0" ] && printf '%b\n' "${local_message}"

    return 0
}
# }}}
# Define the global variables used by main according to the selected
# options: sge_hostname, CHECK_UPGRADE, CHECK_FILE, CHECK_PROCESS,
# FORCE_MODE, the queue lists and the process/file patterns to monitor.
define_vars() { # {{{

    ## If sge_hostname wasn't defined
    if [ -z "${sge_hostname}" ]; then
        ## Use local host for sge_hostname
        sge_hostname="$(hostname -f)"
    fi

    ## If the host to manage is the current one
    if is_current_host "${sge_hostname}"; then
        debug_message "define_vars − ${sge_hostname} is the current host."

        ## Enable to verify if pending upgrades are present
        CHECK_UPGRADE="0"
        ## Enable to watch if some files are present
        CHECK_FILE="0"
        ## Enable to verify if upgrades are running
        CHECK_PROCESS="0"
    else
        ## In case of a remote host
        debug_message "define_vars − ${sge_hostname} is not the current host."

        ## Force to (re)enable SGE queue(s) in any case
        FORCE_MODE="0"
    fi

    ## If FORCE_MODE was defined and enabled
    if [ -n "${FORCE_MODE}" ] && [ "${FORCE_MODE}" -eq "0" ]; then
        ## Disable upgrade checking (remote host, asked behaviour,…)
        CHECK_UPGRADE="1"
        ## Disable files monitoring
        CHECK_FILE="1"
        ## Disable process checking (remote host, asked behaviour,…)
        CHECK_PROCESS="1"
    else
        ## Ensure to define a value
        FORCE_MODE="1"
    fi

    ## Get all queues name (one per line)
    sge_queues_name="$(qhost -h "${sge_hostname:=/dev/null}" -q -xml \
        | grep "queue name" \
        | cut -d"'" -f2)"

    ## Same list on a single line (for messages), derived from the first
    ## result to avoid a second qhost call; stay empty if no queue was found
    sge_queues_name_print=""
    if [ -n "${sge_queues_name}" ]; then
        sge_queues_name_print="$(printf '%s\n' "${sge_queues_name}" \
            | tr -s '\n' ' ')"
    fi

    ## List of process pattern to monitor
    maco_proc_pattern="(/opt/maco/bin/maco.autoupdate.sh)"
    apt_proc_pattern="(aptitude.*full-upgrade|/usr/bin/dpkg.*--configure|dpkg-deb|/bin/sh /usr/lib/needrestart/dpkg-status)"
    sge_proc_pattern="(/usr/lib/gridengine/sge_execd)"

    ## List of files to monitor
    file_nologin_path="/etc/nologin"
    cluster_dir="/opt/ipr/cluster"
    sge_queue_flag_pattern="${cluster_dir}/.sge.*.disable"
}
# }}}

# Check if the SGE commands (qhost) are available on this host.
# Returns 0 if qhost is found, 1 otherwise.
is_sge_host() { # {{{

    ## Check if SGE commands (qhost) are available
    if command -v qhost > /dev/null 2>&1; then
        return_is_sge_host="0"
        debug_message "is_sge_host − SGE seems present on this host."
    else
        return_is_sge_host="1"
        debug_message "is_sge_host − SGE is not present on this host."
    fi

    return "${return_is_sge_host}"
}
# }}}
# Check if the SGE master (sge_master_uri:sge_master_port) is reachable.
# Returns 0 if reachable, 1 otherwise.
is_sge_master_available() { # {{{

    ## Check with Netcat if SGE master (sge_qmaster) is reachable from this host.
    ###  -z: Only scan for listening daemons, without sending any data to them.
    ###  -w 10: Timeout the test after 10 seconds.
    if nc -z -w 10 "${sge_master_uri}" "${sge_master_port}"; then
        return_is_sge_master_available="0"
        debug_message "is_sge_master_available − SGE Master (${sge_master_uri}:${sge_master_port}) is reachable from this host."
    else
        return_is_sge_master_available="1"
        debug_message "is_sge_master_available − SGE Master (${sge_master_uri}:${sge_master_port}) is not reachable from this host."
    fi

    return "${return_is_sge_master_available}"
}
# }}}
# Check if a host name is the FQDN of the current host.
#   $1: host name to compare with the output of "hostname -f"
# Returns 0 if both are equal, 1 otherwise.
is_current_host() { # {{{

    local_current_host="${1}"

    local_current_fqdn=$(hostname -f)

    ## Test if the sge_host to manage is the current host
    if [ "${local_current_host}" = "${local_current_fqdn}" ]; then
        local_current_host_return="0"
    else
        local_current_host_return="1"
    fi

    return "${local_current_host_return}"
}
# }}}
# Check if the APT temp file (APT_TMP_FILE) exists.
# Returns 0 if the file exists (upgrade pending), 1 otherwise.
is_apt_upgrade_present() { # {{{

    ## No pending upgrade by default
    return_apt_upgrade_present="1"

    ### Check if temp APT upgrade file exists
    if [ -f "${APT_TMP_FILE}" ]; then
        return_apt_upgrade_present="0"
        debug_message "is_apt_upgrade_present − APT upgrade seems available for this system."
    else
        return_apt_upgrade_present="1"
        debug_message "is_apt_upgrade_present − NO APT upgrade available for this system."
    fi

    return "${return_apt_upgrade_present}"
}
# }}}
# Check if one of the Maco temp files (MACO_TMP_FILE or
# MACO_TMP_URGENT_FILE) exists.
# Returns 0 if one of them exists (upgrade pending), 1 otherwise.
is_maco_upgrade_present() { # {{{

    ## No pending upgrades by default
    return_maco_upgrade_present="1"

    ## Check if temp Maco upgrade file is present
    if [ -f "${MACO_TMP_FILE}" ]; then
        return_maco_upgrade_present="0"
        debug_message "is_maco_upgrade_present − Maco upgrade seems available."
    ## Check if temp Maco urgent upgrade file is present
    elif [ -f "${MACO_TMP_URGENT_FILE}" ]; then
        return_maco_upgrade_present="0"
        debug_message "is_maco_upgrade_present − Maco urgent upgrade seems available."
    else
        debug_message "is_maco_upgrade_present − No Maco upgrade require."
    fi

    return "${return_maco_upgrade_present}"
}
# }}}
# Check the Maco status stored in MACO_STATUS_FILE.
# Returns 0 only if the file exists and the value of its first
# "MacoStatus" line is "last-update-succeeded", 1 otherwise.
is_maco_status_ok() { # {{{

    ## Maco status not ok by default
    return_maco_status_ok="1"

    ## Check if Maco status file is present
    if [ -f "${MACO_STATUS_FILE}" ]; then
        debug_message "is_maco_status_ok − Maco status file (${MACO_STATUS_FILE}) exists."
        ## Keep what follows the first "=" of the first matching line
        local_maco_status=$(grep --max-count=1 -- MacoStatus "${MACO_STATUS_FILE}" | cut --delimiter="=" --fields=2)

        ## Check current Maco status
        if [ "${local_maco_status}" = "last-update-succeeded" ]; then
            debug_message "is_maco_status_ok − Last Maco upgrade succeed (${local_maco_status})."
            return_maco_status_ok="0"
        else
            debug_message "is_maco_status_ok − Maco require upgrade/maintenance (current state: ${local_maco_status})."
        fi
    else
        debug_message "is_maco_status_ok − Maco status file (${MACO_STATUS_FILE}) doesn't exists."
    fi

    return "${return_maco_status_ok}"
}
# }}}
# Check if at least one process matches a pattern on the current host.
#   $1: pattern given to "pgrep -f" (matched against the full command line)
# Returns 0 if at least one process matches, 1 otherwise.
is_proc_running() { # {{{

    local_proc_pattern="${1}"

    local_count_proc_pattern="$(pgrep -f -- "${local_proc_pattern}" | wc -l)"

    case "${local_count_proc_pattern}" in
        0 ) ## No procs related to this pattern are running
            return_proc_running="1"
            ;;
        * ) ## At least one proc seems running
            return_proc_running="0"
            ;;
    esac

    ## Simple debug message to valid current variables
    debug_message "is_proc_running − procs running (with the pattern: ${RED}${local_proc_pattern}${COLOR_DEBUG}) on the current host: ${RED}${local_count_proc_pattern}${COLOR_DEBUG}."

    return "${return_proc_running}"
}
# }}}
# Check if a file exists.
#   $1: path or glob pattern (intentionally expanded unquoted)
# Returns 0 if find succeeds on the expanded path(s), 1 otherwise.
is_file_present() { # {{{

    local_file_present="${1}"

    ## File doesn't exist by default
    return_is_file_present="1"

    ### Check if the file exists
    # shellcheck disable=SC2086
    if find ${local_file_present} > /dev/null 2>&1; then
        return_is_file_present="0"
        debug_message "is_file_present − The file ${RED}${local_file_present}${COLOR_DEBUG} exists."
    else
        return_is_file_present="1"
        debug_message "is_file_present − The file ${RED}${local_file_present}${COLOR_DEBUG} doesn't exist."
    fi

    return "${return_is_file_present}"
}
# }}}
# Check if a file doesn't exist.
#   $1: path or glob pattern (intentionally expanded unquoted)
# Returns 0 if find fails on the expanded path(s), 1 otherwise.
is_file_absent() { # {{{

    local_file_absent="${1}"

    ## File exists by default
    return_is_file_absent="1"

    ### Check if the file exists
    # shellcheck disable=SC2086
    if find ${local_file_absent} > /dev/null 2>&1; then
        return_is_file_absent="1"
        debug_message "is_file_absent − The file ${RED}${local_file_absent}${COLOR_DEBUG} exists."
    else
        return_is_file_absent="0"
        debug_message "is_file_absent − The file ${RED}${local_file_absent}${COLOR_DEBUG} doesn't exist."
    fi

    return "${return_is_file_absent}"
}
# }}}
# Check if a SGE queue is enabled.
#   $1: host name of the queue
#   $2: queue name
# Returns 0 if the queue is enabled, 1 if it is disabled.
# Exits the script with code 2 if the state can't be determined.
is_queue_enable() { # {{{

    local_queue_enable_hostname="${1}"
    local_queue_enable_name="${2}"

    ## List all queues with 'disable' state and filter to the expected queue name
    ##   with a fake_user to avoid pending jobs for this queue
    ### And count returned lines
    local_queue_enable_test=$(qstat -f -qs d -q "${local_queue_enable_name:=/dev/null}@${local_queue_enable_hostname:=/dev/null}" -u fake_user \
        | wc -l)

    case "${local_queue_enable_test}" in
        0 ) ## No result so the queue is enable
            local_sge_queue_state="enable"
            return_queue_enable="0"
            ;;
        3 ) ## Results (header + queue name) so the queue is disable
            local_sge_queue_state="disable"
            return_queue_enable="1"
            ;;
        * ) ## Unexpected result
            printf '%b\n' "${RED}Not able to determine the state of ${local_queue_enable_name:=/dev/null}@${local_queue_enable_hostname:=/dev/null} queue (command return ${local_queue_enable_test} lines).${RESET}"
            exit 2
            ;;
    esac

    ## Simple debug message to valid current variables
    debug_message "is_queue_enable − SGE queue: ${RED}${local_queue_enable_name:=/dev/null}${COLOR_DEBUG} state is: ${RED}${local_sge_queue_state:=/dev/null}${COLOR_DEBUG}."

    return "${return_queue_enable}"
}
# }}}
# Check if a SGE queue is disabled.
#   $1: host name of the queue
#   $2: queue name
# Reads the global sge_queue_flag_file: when the queue is enabled, this
# flag file is removed (if the variable is defined).
# Returns 0 if the queue is disabled, 1 if it is enabled.
# Exits the script with code 3 if the state can't be determined.
is_queue_disable() { # {{{

    local_queue_disable_hostname="${1}"
    local_queue_disable_name="${2}"

    ## List all queues with 'disable' state and filter to the expected queue name
    ##   add a fake_user to avoid pending jobs for this queue
    ### And count returned lines
    local_queue_disable_test=$(qstat -f -qs d -q "${local_queue_disable_name:=/dev/null}@${local_queue_disable_hostname:=/dev/null}" -u fake_user \
        | wc -l)

    case "${local_queue_disable_test}" in
        0 ) ## No result so the queue is enable
            local_sge_queue_state="enable"
            return_queue_disable="1"
            ## Ensure to remove any previously setted file
            ## (never run a removal with an empty path)
            if [ -n "${sge_queue_flag_file}" ]; then
                rm -f -- "${sge_queue_flag_file}"
            fi
            ;;
        3 ) ## Results (header + queue name) so the queue is disable
            local_sge_queue_state="disable"
            return_queue_disable="0"
            ;;
        * ) ## Unexpected result
            printf '%b\n' "${RED}Not able to determine the state of ${local_queue_disable_name:=/dev/null}@${local_queue_disable_hostname:=/dev/null} queue (command return ${local_queue_disable_test} lines).${RESET}"
            exit 3
            ;;
    esac

    ## Simple debug message to valid current variables
    debug_message "is_queue_disable − SGE queue: ${RED}${local_queue_disable_name:=/dev/null}${COLOR_DEBUG} state is: ${RED}${local_sge_queue_state:=/dev/null}${COLOR_DEBUG}."

    return "${return_queue_disable}"
}
# }}}
# Check if all the given SGE queues are enabled.
#   $1: host name of the queues
#   $2: whitespace separated list of queue names
# Returns 0 if every queue is enabled (or the list is empty), 1 otherwise.
is_all_queue_enable() { # {{{

    local_all_queue_enable_hostname="${1}"
    local_all_queue_enable_name="${2}"

    ## By default, all queues are enable
    return_all_queue_enable="0"

    ## Test all queues one by one (list intentionally split on whitespace)
    for loop_enable_queue in ${local_all_queue_enable_name}; do
        ### If a queue is not enable
        ####  Change the return value
        is_queue_enable "${local_all_queue_enable_hostname}" "${loop_enable_queue}" \
            || return_all_queue_enable="1"
    done

    return "${return_all_queue_enable}"
}
# }}}
# Try to enable a SGE queue.
#   $1: host name of the queue
#   $2: queue name
# The queue is only enabled if its flag file (global sge_queue_flag_file)
# exists or if FORCE_MODE is enabled (=0).
# Returns the exit code of qmod if it was called, 1 if the queue was
# left untouched (considered as manually disabled).
enable_sge_queue() { # {{{

    local_sge_hostname="${1}"
    local_sge_queue_name="${2}"

    ## If the queue was previously disabled by another script OR if FORCE_MODE is enable
    if [ -f "${sge_queue_flag_file}" ] || [ "${FORCE_MODE}" -eq "0" ]; then
        debug_message "enable_sge_queue − Previously disabled by a script (or FORCE is enable), try to enable SGE queue: ${RED}${local_sge_queue_name:=/dev/null}@${local_sge_hostname:=/dev/null}${COLOR_DEBUG}."
        ## SGE command to enable the queue
        ## The status is always stored, so a failure is reported to the caller
        qmod --enable "${local_sge_queue_name}@${local_sge_hostname}" > /dev/null
        return_enable_queue="${?}"

        if [ "${return_enable_queue}" -eq "0" ]; then
            message "Enable SGE queue: ${RED}${local_sge_queue_name:=/dev/null}@${local_sge_hostname:=/dev/null}${RESET}"
        fi
    else
        message "SGE queue: ${RED}${local_sge_queue_name:=/dev/null}@${local_sge_hostname:=/dev/null}${RESET} was manually disabled, please re-enable it ${RED}manually${RESET} (or use --force option)."
        return_enable_queue="1"
    fi

    return "${return_enable_queue}"
}
# }}}

# Run all the checks then try to enable the queue(s) of sge_hostname.
# See the top of the file for the exit codes.
main() { # {{{

    ## If SGE is not yet available on this host {{{
    ### Exit
    is_sge_host \
        || exit 0
    ## }}}

    ## Test if SGE Master is reachable {{{
    ### If sge_master_uri wasn't defined (environment variable,…) {{{
    if [ -z "${sge_master_uri}" ]; then
        ## Use the default SGE master
        sge_master_uri="physix-master.ipr.univ-rennes1.fr"
    fi
    ### }}}
    ### If sge_master_port wasn't defined (environment variable,…) {{{
    if [ -z "${sge_master_port}" ]; then
        ## Use the default SGE master port
        sge_master_port="6444"
    fi
    ### }}}
    ### If SGE Master is not reachable from this host {{{
    #### Exit
    is_sge_master_available \
        || exit 0
    ### }}}
    ## }}}

    ## Define all vars according the selected options
    define_vars

    ## If we need to watch for upgrades
    if [ "${CHECK_UPGRADE}" -eq "0" ]; then
        ## If APT package upgrade is available
        ### Exit (wait for APT upgrade to be applied)
        is_apt_upgrade_present \
            && exit 0

        ## If Maco upgrade is present
        ### Exit (wait for Maco upgrade to be applied)
        is_maco_upgrade_present \
            && exit 0
    fi

    ## If Maco status is ok, CONTINUE
    ### Else Exit (wait for upgrade/maintenance)
    is_maco_status_ok \
        || exit 0

    ## If we need to watch for processes
    if [ "${CHECK_PROCESS}" -eq "0" ]; then
        ## If anything related to APT is currently running
        ### Exit
        is_proc_running "${apt_proc_pattern}" \
            && exit 0

        ## If anything related to maco is currently running
        ### Exit
        is_proc_running "${maco_proc_pattern}" \
            && exit 0

        ## If nothing related to SGE is currently running
        ### Try to start the SGE execd systemd service
        ### Exit with error if the service can't start
        is_proc_running "${sge_proc_pattern}" \
            || systemctl --quiet start sge_execd.service > /dev/null 2>&1 \
            || exit 4
    fi

    ## If we need to watch files
    if [ "${CHECK_FILE}" -eq "0" ]; then
        ## If nologin file exist (error on upgrade,…)
        ### Exit
        is_file_present "${file_nologin_path}" \
            && exit 0

        ## If all SGE queue(s) were manually disabled (not any flag file)
        ### Exit
        is_file_absent "${sge_queue_flag_pattern}" \
            && exit 0
    fi

    ## Simple debug message with color to valid current variables
    debug_message "main − Try to manage SGE queue(s): ${RED}${sge_queues_name_print:=/dev/null}${COLOR_DEBUG}for host: ${RED}${sge_hostname:=/dev/null}${COLOR_DEBUG}."

    ## If the queue(s) are already enable
    ### Ensure to remove any potential flag file
    ### Exit (even if there was no flag file to remove)
    if is_all_queue_enable "${sge_hostname}" "${sge_queues_name}"; then
        ## The pattern must stay unquoted to be expanded by the shell
        # shellcheck disable=SC2086
        rm -f -- ${sge_queue_flag_pattern}
        exit 0
    fi

    ## Test all queues one by one (list intentionally split on whitespace)
    for loop_queue in ${sge_queues_name}; do

        ## File previously set if the queue was disabled by a script
        ##   "automatically disabled" for an upgrade
        sge_queue_flag_file="${cluster_dir}/.sge.${loop_queue}.disable"

        ## If the queue is disable
        ### Try to enable it
        is_queue_disable "${sge_hostname}" "${loop_queue}" \
            && enable_sge_queue "${sge_hostname}" "${loop_queue}"

        ## Don't consider manually disabled queue as an error except if FORCE_MODE was specified
        if [ -f "${sge_queue_flag_file}" ] || [ "${FORCE_MODE}" -eq "0" ]; then
            ## If the queue is still disable
            ### Exit with error
            is_queue_disable "${sge_hostname}" "${loop_queue}" \
                && printf '%b\n' "${RED}ERROR ${loop_queue}@${sge_hostname} is still disable.${RESET}" \
                && exit 5
        fi

    done
}
# }}}

# Manage arguments # {{{
# This code can't be in a function due to arguments

if [ "${NBARGS}" -ne "0" ]; then

    manage_arg="0"

    ## If the first argument is not an option
    if ! printf -- '%s' "${1}" | grep -q -E -- "^-+"; then
        ## Use this argument for sge_hostname
        sge_hostname="${1}"

        ## Switch to the next argument
        shift
        manage_arg=$((manage_arg+1))
    fi

    # Parse all options (start with a "-") one by one
    while printf -- '%s' "${1}" | grep -q -E -- "^-+"; do

        case "${1}" in
            -d|--debug ) ## debug
                DEBUG=0
                ;;
            -f|--force ) ## Force to enable SGE queue
                FORCE_MODE=0
                ;;
            --help ) ## help
                usage
                ## Exit after help informations
                exit 0
                ;;
            -h|--host|--hostname ) ## Specify a different host to manage
                ## A value is mandatory, don't shift past the last argument
                if [ "${#}" -lt "2" ]; then
                    printf '%b\n' "${RED}Option ${1} requires an argument.${RESET}"
                    printf '%b\n' "---"
                    usage
                    exit 1
                fi
                ## Move to the next argument
                shift
                ## Override previous definition of sge_hostname
                sge_hostname="${1}"
                ;;
            -q|--quiet ) ## Silent mode
                ## Avoid to display any message on standard output
                OUTPUT_MESSAGE=1
                ;;
            -- ) ## End of options list
                ## End the while loop
                break
                ;;
            * ) ## unknow option
                printf '%b\n' "${RED}Invalid option: ${1}${RESET}"
                printf '%b\n' "---"
                usage
                exit 1
                ;;
        esac

        debug_message "Arguments management − ${RED}${1}${COLOR_DEBUG} option managed."

        ## Move to the next argument
        shift
        manage_arg=$((manage_arg+1))

    done

    debug_message "Arguments management − ${RED}${manage_arg}${COLOR_DEBUG} argument(s) successfully managed."
else
    debug_message "Arguments management − No arguments/options to manage."
fi

# }}}

main

exit 0