#!/bin/sh
# Purpose {{{
## If the Xymon server reports that a service is in error on a remote host,
## try to restart this service.
## 1. Create a ssh keyring for xymon user {{{
# sudo mkdir -p -- /var/lib/xymon/.ssh/
# sudo ssh-keygen -f /var/lib/xymon/.ssh/id_rsa -N '' -q
# sudo chown -R xymon:xymon /var/lib/xymon/.ssh/
## }}}
## 2. Remote user {{{
# Ensure that the ${REMOTE_SSH_USER} user exists on the remote hosts and is
# allowed to connect with SSH.
# Restrict the SSH access to a single SSH key from the Xymon server IP
# (~${REMOTE_SSH_USER}/.ssh/authorized_keys) :
## from="IP.SRV.XYM.ON" ssh-rsa AAAAA…
# Allow sudo commands to restart services (/etc/sudoers.d/xymon-ssh) :
## xymon-ssh ALL=(root:root) NOPASSWD: /bin/systemctl restart *
## }}}
## 3. Xymon Configuration {{{
# PROC monitoring needs to display the real service name in its description :
## PROC %^/sbin/rpcbind MIN=1 MAX=1 COLOR=red "TEXT=rpcbind"
# You can add more information about this proc if you add an underscore "_" :
## PROC %^/usr/sbin/rpc.idmapd MIN=1 MAX=1 COLOR=red "TEXT=NFS-server_rpc.idmapd"
## This way, the script will only take the text before the underscore "_" as the
## service name to be restarted.
# Don't add whitespaces in the description of a process.
## }}}
# }}}
# Vars {{{
# Debug switch: the script logs its DEBUG messages and keeps ${temp_dir} only
# when this is 0; any other integer disables both.
DEBUG=1

# Account used to open the SSH session on the remote host.
REMOTE_SSH_USER="xymon-ssh"

# Private working directory. Stop right away if it cannot be created:
# otherwise every path below would resolve directly under "/".
temp_dir=$(mktemp -d -t xymon-procs-alert-XXXXXX.tmp) || {
  printf '%s\n' "${0##*/}: unable to create a temporary directory" >&2
  exit 1
}
debug_stdout="${temp_dir}/debug.stdout"
debug_stderr="${temp_dir}/debug.stderr"
# Lowercased list of the processes reported in error, one per line.
service_list="${temp_dir}/services.error.list"
# }}}

# Create log files
touch -- "${debug_stdout}" "${debug_stderr}"
# Manage only procs probe {{{
# BBSVCNAME and BBHOSTNAME are read here but never set by this script;
# presumably Xymon exports them when it runs the alert script (to confirm
# against the Xymon alert configuration).
# The escape character is written \033 because \e is not a POSIX printf
# escape and is printed literally by some /bin/sh implementations.
if [ "${BBSVCNAME}" = "procs" ]; then
  [ "${DEBUG}" -eq "0" ] && printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${BBHOSTNAME} — ${BBSVCNAME} error" >> "${debug_stdout}"
else
  [ "${DEBUG}" -eq "0" ] && printf '\033[1;35m%-6s\033[m\n' "DEBUG : ${BBHOSTNAME} — ${BBSVCNAME} probe is not managed." >> "${debug_stderr}"
  # Nothing to do for other probes: drop the working directory unless the
  # debug mode wants to keep it, then leave successfully.
  [ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir}"
  exit 0
fi
# }}}
# Helpers {{{
## Append the message $1 to ${debug_stdout}, only when DEBUG is 0.
## \033 is used for the escape character because \e is not a POSIX printf
## escape.
debug_message() {
  [ "${DEBUG}" -eq "0" ] || return 0
  printf '\033[1;35m%-6s\033[m\n' "${1}" >> "${debug_stdout}"
}

## Parse one line of the error list ($1) and set the global variables
## service_name, process_found, process_min and process_max.
## The four variables are emptied first, so a line matching none of the
## known patterns never reuses the values parsed from the previous line.
parse_process_line() {
  service_name=""
  process_found=""
  process_min=""
  process_max=""

  ## Pattern "req. between" {{{
  if printf '%s\n' "${1}" | grep -q -E -- ".* \\(found .*, req. between .* and .*\\)"; then
    debug_message "DEBUG : while process loop — Pattern \"req. between\"."
    # Only the text before the first underscore is the service name.
    service_name="$(printf '%s\n' "${1}" | cut -d" " -f1 | sed 's/_.*//')"
    process_found="$(printf '%s\n' "${1}" | cut -d" " -f3 | tr -d ',')"
    process_min="$(printf '%s\n' "${1}" | cut -d" " -f6)"
    process_max="$(printf '%s\n' "${1}" | cut -d" " -f8 | tr -d ')')"
  fi
  ## }}}
  ## Pattern "req. .* or more" {{{
  if printf '%s\n' "${1}" | grep -q -E -- ".* \\(found .*, req. .* or more\\)"; then
    debug_message "DEBUG : while process loop — Pattern \"req. .* or more\"."
    service_name="$(printf '%s\n' "${1}" | cut -d" " -f1 | sed 's/_.*//')"
    process_found="$(printf '%s\n' "${1}" | cut -d" " -f3 | tr -d ',')"
    process_min="$(printf '%s\n' "${1}" | cut -d" " -f5)"
    process_max="nolimit"
  fi
  ## }}}
}

## Succeed when the last parsed line describes a service with fewer running
## processes than required. Both counters must be unsigned integers, which
## also keeps non-numeric values away from the "-lt" comparison.
service_needs_restart() {
  case "${process_found}" in ''|*[!0-9]*) return 1 ;; esac
  case "${process_min}" in ''|*[!0-9]*) return 1 ;; esac
  [ "${process_found}" -lt "${process_min}" ]
}

## Succeed when $1 can be embedded in the remote command line: not empty, not
## starting with a dash (it would be read as an option) and made only of
## lowercase letters, digits, ":", "@", "." and "-". The name is extracted
## from the status message and ends up in a command interpreted by the remote
## shell, so anything else is refused.
is_safe_service_name() {
  case "${1}" in
    ''|-*|*[!a-z0-9:@.-]*) return 1 ;;
  esac
  return 0
}
# }}}

# Get the list of processes with an error
# BBALPHAMSG is read but not set by this script; presumably it holds the text
# of the status message (to confirm against the Xymon alert configuration).
# printf is used instead of echo so backslashes in the message stay untouched.
printf '%s\n' "${BBALPHAMSG}" | grep -E "&(red|yellow)" | cut -d" " -f2- | tr '[:upper:]' '[:lower:]' > "${service_list}"

# If any error on a process
if [ -s "${service_list}" ]; then
  debug_message "DEBUG : process list — Some processes seems to be in error."

  while IFS= read -r line; do
    parse_process_line "${line}"
    debug_message "DEBUG : while process loop — Found ${process_found} process(es) for ${service_name} service and require between ${process_min} and ${process_max}."

    # Restart service if needed {{{
    if ! service_needs_restart; then
      debug_message "DEBUG : while process loop — ${service_name} service is not managed."
    elif ! is_safe_service_name "${service_name}"; then
      debug_message "DEBUG : while process loop — ${service_name} is not a valid service name, restart skipped."
    else
      debug_message "DEBUG : while process loop — ${service_name} need to be restarted."
      debug_message "DEBUG : while process loop — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart ${service_name}.service"
      # -n keeps ssh from reading the loop's standard input (the service list).
      ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}"@"${BBHOSTNAME}" "sudo systemctl restart ${service_name}.service" >> "${debug_stdout}" 2>> "${debug_stderr}"
    fi
    # }}}
  done < "${service_list}"

  # Also restart xymon-client service {{{
  debug_message "DEBUG : process list — xymon-client also need to be restarted."
  debug_message "DEBUG : process list — ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${REMOTE_SSH_USER}@${BBHOSTNAME} sudo systemctl restart xymon-client.service"
  ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${REMOTE_SSH_USER}"@"${BBHOSTNAME}" "sudo systemctl restart xymon-client.service" >> "${debug_stdout}" 2>> "${debug_stderr}"
  # }}}
else
  debug_message "DEBUG : process list — No error on any process."
fi
# Remove empty error file
# (-s is true only for an existing, non-empty file, so an error log with
# content is kept.)
[ -s "${debug_stderr}" ] || rm -f -- "${debug_stderr}"
# Remove temp_dir if DEBUG is disabled (DEBUG different from 0)
[ "${DEBUG}" -eq "0" ] || rm -rf -- "${temp_dir}"

# Always report success to the caller, whatever the restarts returned.
exit 0