#!/bin/sh
# Purpose {{{
## If the Xymon server reports that a service is in error on a remote host,
## try to restart this service.
## 1. Create an SSH key pair for the xymon user {{{
# sudo mkdir -p -- /var/lib/xymon/.ssh/
# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q
# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/
## }}}
## 2. Remote user {{{
# Ensure the ${REMOTE_SSH_USER} user is available on remote hosts and allowed
# to connect with SSH.
# Restrict the SSH access to a single SSH key from the Xymon server IP
# (~${REMOTE_SSH_USER}/.ssh/authorized_keys) :
## from="IP.SRV.XYM.ON" ssh-rsa AAAAA…
# Allow sudo commands to restart services (/etc/sudoers.d/xymon-ssh) :
## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart *
## }}}
## 3. Xymon Configuration {{{
# PROC monitoring needs to display the real service name in its description :
## PROC %^/sbin/rpcbind MIN=1 MAX=1 COLOR=red "TEXT=rpcbind"
# You can add more information about this proc if you use an underscore "_" :
## PROC %^/usr/sbin/rpc.idmapd MIN=1 MAX=1 COLOR=red "TEXT=NFS-server_rpc.idmapd"
## This way, the script will only take the text before the underscore "_" as the
## service name to be restarted.
# Don't add whitespace in the description of a process.

## }}}
# }}}

# Vars {{{
# DEBUG follows the shell exit-status convention used throughout this script:
# 0 enables the debug messages and keeps the temporary directory, any other
# value disables the messages and lets the script delete the directory.
DEBUG=1

# Account used to open the SSH session on the remote host.
REMOTE_SSH_USER="xymon-ssh"

# Private working directory for this run. Abort when it cannot be created:
# with an empty ${temp_dir} every path below would point at the filesystem
# root (/debug.stdout, …) and the final cleanup would run on an empty path.
if ! temp_dir=$(mktemp -d -t xymon-procs-alert-XXXXXX.tmp) || [ ! -d "${temp_dir}" ]; then
  printf '%s\n' "xymon procs alert: unable to create a temporary directory." >&2
  exit 1
fi

# Debug trace and output of the remote commands.
debug_stdout="${temp_dir}/debug.stdout"
# Errors of the remote commands and "probe not managed" notices.
debug_stderr="${temp_dir}/debug.stderr"
# One line per process reported in error.
service_list="${temp_dir}/services.error.list"
# }}}

# Create log files
touch -- "${debug_stdout}" "${debug_stderr}"

# Manage only procs probe {{{
# BBSVCNAME and BBHOSTNAME are read from the environment (presumably exported
# by Xymon when it runs an alert script — confirm against the Xymon setup).
# Any probe other than "procs" ends the script here with status 0.
#
# The colour sequences are written with \033: \e is not an escape defined by
# POSIX printf, and /bin/sh implementations such as dash print it literally.
if [ "${BBSVCNAME}" = "procs" ]; then
  [ "${DEBUG}" -eq "0" ] && printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${BBHOSTNAME} — ${BBSVCNAME} error" >> "${debug_stdout}"
else
  [ "${DEBUG}" -eq "0" ] && printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${BBHOSTNAME} — ${BBSVCNAME} probe is not managed." >> "${debug_stderr}"
  # Debugging disabled: nothing worth keeping, drop the working directory.
  [ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir}"
  exit 0
fi
# }}}

# Restart the services whose processes are reported in error {{{

# Append one debug line (bold magenta) to ${debug_stdout}.
# Does nothing unless DEBUG is 0. \033 is used because \e is not an escape
# defined by POSIX printf.
#   $1 - message, written after the "DEBUG : " prefix
debug_log() {
  [ "${DEBUG:-1}" -eq 0 ] || return 0
  printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${1}" >> "${debug_stdout}"
}

# Filter a status message read on stdin: keep the lines containing "&red" or
# "&yellow", drop their first space-separated field (presumably the colour
# tag) and lowercase what remains.
list_procs_in_error() {
  grep -E "&(red|yellow)" | cut -d" " -f2- | tr '[:upper:]' '[:lower:]'
}

# Print the space-separated field number $2 of the string $1.
line_field() {
  printf '%s\n' "${1}" | cut -d" " -f"${2}"
}

# Succeed when $1 is a non-empty string of decimal digits.
is_uint() {
  case "${1}" in
    '' | *[!0-9]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Succeed when $1 can be spliced into the remote "systemctl restart" command:
# non-empty, not starting with "-" (would be read as an option) and made only
# of lowercase letters, digits, "@", ".", ":" and "-". Anything else (spaces,
# ";", "$", quotes, …) would be interpreted by the remote shell.
is_safe_unit_name() {
  case "${1}" in
    '' | -* | *[!a-z0-9@.:-]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Parse one line of the error list into the global variables service_name,
# process_found, process_min and process_max.
#   $1 - line such as "rpcbind (found 0, req. between 1 and 1)"
#        or "nfs-server_rpc.idmapd (found 0, req. 1 or more)"
# Returns 0 when the line matches one of the two known patterns, 1 otherwise
# (all four variables are then empty).
parse_proc_line() {
  # Always start from a clean slate: a line that matches no pattern must not
  # inherit the values parsed from the previous line.
  service_name=""
  process_found=""
  process_min=""
  process_max=""

  if printf '%s\n' "${1}" | grep -q -E -- ".* \\(found .*, req. between .* and .*\\)"; then
    debug_log "while process loop — Pattern \"req. between\"."
    process_min="$(line_field "${1}" 6)"
    process_max="$(line_field "${1}" 8 | tr -d ')')"
  elif printf '%s\n' "${1}" | grep -q -E -- ".* \\(found .*, req. .* or more\\)"; then
    debug_log "while process loop — Pattern \"req. .* or more\"."
    process_min="$(line_field "${1}" 5)"
    process_max="nolimit"
  else
    return 1
  fi

  # Only the text before the first underscore names the service.
  service_name="$(line_field "${1}" 1 | sed 's/_.*//')"
  process_found="$(line_field "${1}" 3 | tr -d ',')"
  return 0
}

# Get the list of processes with an error
# (printf rather than echo: echo may interpret backslashes or a leading "-").
printf '%s\n' "${BBALPHAMSG}" | list_procs_in_error > "${service_list}"

# If any error on a process
if [ -s "${service_list}" ]; then
  debug_log "process list — Some processes seems to be in error."

  while IFS= read -r line; do
    parse_proc_line "${line}"
    debug_log "while process loop — Found ${process_found} process(es) for ${service_name} service and require between ${process_min} and ${process_max}."

    # Restart only when fewer processes than the required minimum were found,
    # the counters are real numbers and the name is safe to send to the
    # remote shell.
    if is_safe_unit_name "${service_name}" \
      && is_uint "${process_found}" \
      && is_uint "${process_min}" \
      && [ "${process_found}" -lt "${process_min}" ]; then
      debug_log "while process loop — ${service_name} need to be restarted."
      debug_log "while process loop — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart ${service_name}.service"
      # -n keeps ssh away from stdin, which is the list feeding this loop.
      # NOTE(review): host key verification is disabled by these options.
      ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}@${BBHOSTNAME}" "sudo systemctl restart ${service_name}.service" >> "${debug_stdout}" 2>> "${debug_stderr}"
    else
      debug_log "while process loop — ${service_name} service is not managed."
    fi
  done < "${service_list}"

  # Also restart xymon-client service, whatever was done in the loop above.
  debug_log "process list — xymon-client also need to be restarted."
  debug_log "process list — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart xymon-client.service"
  ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}@${BBHOSTNAME}" "sudo systemctl restart xymon-client.service" >> "${debug_stdout}" 2>> "${debug_stderr}"
else
  debug_log "process list — No error on any process."
fi
# }}}

# Drop the error log when nothing was written to it.
if [ ! -s "${debug_stderr}" ]; then
  rm -f "${debug_stderr}"
fi

# Keep the working directory only while debugging (DEBUG = 0); in every other
# case delete it together with the logs it holds.
if ! [ "${DEBUG}" -eq "0" ]; then
  rm -rf -- "${temp_dir}"
fi

exit 0