#!/bin/bash
# scripts/cluster/disk-watchdog.sh
#
# The role of this script is to measure the size of the directories in $SHARED_DISK_ROOT and to send an e-mail to $DEST_EMAIL if the disk fullness reaches $FULLNESS_THRESHOLD
# see https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=3193 for details
#
# test run:
# graffy@work:~/bug3193$ ./disk-watchdog.sh /mnt/work/graffy/workspaces/meniscus $HOME/var/run/ipr/cluster/disk-watchdog /opt/ipr/cluster/work.global 90 guillaume.raffy@univ-rennes.fr "manual trigger" 20
# production run:
# graffy@work:~/bug3193$ sudo ./disk-watchdog.sh /mnt/work /var/run/ipr/cluster/disk-watchdog /opt/ipr/cluster/work.global 90 ipr-cluster@listes.univ-rennes1.fr "cron" 20

# Positional arguments (exactly 7 are expected; the count is checked at the bottom of the script)
SHARED_DISK_ROOT="$1"   # eg '/mnt/work'
REPORTS_ROOT_PATH="$2"  # eg "$HOME/var/run/ipr/cluster/disk-watchdog"
DISK_PUBLIC_PATH="$3"   # eg '/opt/ipr/cluster/work.global'
FULLNESS_THRESHOLD="$4" # disk fullness percentage above which an e-mail is sent (eg '90' for 90%)
DEST_EMAIL="$5"         # eg ipr-cluster@listes.univ-rennes1.fr
TRIGGER_REASON="$6"     # eg "daily cron"
TOP_SIZE="$7"           # number of biggest directories reported (eg 20)

# Exit / return codes shared by all the functions of this script
readonly RETURNCODE_SUCCESS=0
readonly RETURNCODE_ERROR=1
# Sends a message to the system log, tagged 'disk-watchdog'.
# Arguments: $1 - the message to log
function log()
{
    # 'local' keeps this helper from overwriting a global variable named 'message'
    local message="$1"
    logger -t 'disk-watchdog' "${message}"
}
# Prints a timestamped error message on stderr.
# Arguments: $1 - the error message
# Outputs:   "<date> : ERROR : <message>" on stderr, nothing on stdout
function error()
{
    # 'local' keeps this helper from overwriting a global variable named 'message'
    local message="$1"
    printf '%s : ERROR : %s\n' "$(date)" "${message}" >&2
}
# Prints the values of one column of a whitespace-separated table whose first
# line holds the column names (such as the output of 'df -m').
#
# Arguments:
#   $1 - path of the table file, eg /var/run/ipr/cluster/disk-watchdog/meniscus-2022-05-07-16-43-38/total.txt
#   $2 - name of the column to extract, eg Use%
# Outputs:
#   one line per data row, containing the value found in the requested column
# Returns:
#   0 on success, non-zero if the header line has no column with that name
#   (in which case nothing is printed)
#
# Note: fields are split on whitespace, so a header made of several words
# (eg "Mounted on") counts as several columns.
function sheet_get_column()
{
    local sheet_file_path="$1"
    local column_name="$2"

    awk -v col="${column_name}" '
        BEGIN { column_index = 0; status = 0 }
        NR == 1 {
            # locate the requested column in the header line
            for (i = 1; i <= NF; i++) {
                if ($i == col) {
                    column_index = i
                    break
                }
            }
            if (column_index == 0) {
                # unknown column: without this guard, "print $0" would output whole rows
                status = 1
                exit
            }
            next
        }
        { print $column_index }
        END { exit status }
    ' "${sheet_file_path}"
}
# Builds the <body> of the warning e-mail in one language.
#
# The text is written on stdout as a *printf format string*: line breaks are
# encoded as the two characters '\' 'n' and percent signs are doubled ('%%'),
# because the caller passes the whole message to printf as its format.
#
# Arguments:
#   $1 - language: 'french' or 'english'
#   $2 - rows of the disk usage table, as html (<tr>...</tr>)
#   $3 - public path of the disk, eg /opt/ipr/cluster/work.global
#   $4 - size of the disk, in Gb
#   $5 - fullness threshold, eg '90' for 90%
#   $6 - list of administrators, as html (<li>...</li>)
#   $7 - (optional) current fullness of the disk, in percent; when omitted,
#        the value of the variable 'disk_fullness' visible from the caller's
#        scope is used
# Outputs:
#   the html body on stdout
# Returns:
#   0 on success, 1 if the language is not supported (nothing is printed then)
function create_html_contents()
{
    local language_id="$1"
    local disk_usage_table_as_html="$2"
    local disk_public_path="$3"
    local disk_size_in_gb="$4"
    local fullness_threshold="$5"
    local admin_list_as_html="$6"
    # the expansion happens before 'local' takes effect, so ${disk_fullness} still refers to the caller's variable here
    local fullness="${7:-${disk_fullness:-}}"
    local html_content=''

    case "${language_id}" in
        'english')
            html_content+="<body>\n"
            html_content+="<p>This is an automatic message, please don't reply.</p>\n"
            html_content+="<p>The shared disk <code>${disk_public_path}</code> (${disk_size_in_gb} Gb) is ${fullness}%% full. As a full disk will cause the jobs to unnecessarily fail, it's urgent to do some cleanup especially for the top biggest users:</p>\n"
            html_content+="<table>\n"
            html_content+="<th>rank</th><th>used size (in Mb)</th><th>owner</th><th>folder</th>\n"
            html_content+="${disk_usage_table_as_html}"
            html_content+="</table>\n"
            html_content+="<p>This message is automatically sent daily to all cluster users until the disk usage goes below ${fullness_threshold}%%. So if you're wondering \"All this is good, but can I do something to avoid this annoying message?\", then you'll probably be happy to hear that the answer is yes:</p>\n"
            html_content+="<ul>\n"
            html_content+="<li>if your name is not on the list above, then you can go and harass a random user that is in the list;\n</li>\n"
            html_content+="<li>if your name is high on the list above, then you'd better do a cleanup of your files before the other cluster users find you! 😉\n</li>\n"
            html_content+="</ul>\n"
            html_content+="Reminder (cf <a href=http://intranet.ipr.univ-rennes1.fr/simpaweb/cluster/PhysixUserGuide>IPR cluster user guide</a>): the disk <code>${disk_public_path}</code> is not intended to be used for archiving (it's not even backed up!); it's a workspace whose purpose is to temporarily store the data required to make the jobs work. Except in some special use cases (eg reuse the output data as input for future jobs), the users are expected to transfer or delete the output files on <code>${disk_public_path}</code> once their job has ended.\n"
            html_content+="<p>The cluster user committee decided not to put quotas per user on <code>${disk_public_path}</code> because it would result in significantly less disk space per user and less flexibility. However, using this shared disk without quotas can only work if all users act in a responsible manner, cleaning up their own data after usage. Please make this work, it's in the interest of all users, including you. If you think the size of <code>${disk_public_path}</code> is too small for your needs, please contact the cluster administrators.</p>\n"
            html_content+="<p>Thank you for your understanding</p>\n"
            html_content+="<p>The IPR cluster administrators:</p>\n"
            html_content+="<ul>\n"
            html_content+="${admin_list_as_html}"
            html_content+="</ul>\n"
            html_content+="</body>\n"
            ;;
        'french')
            html_content+="<body>\n"
            html_content+="<p>Ceci est un message automatique, ne pas répondre svp.</p>\n"
            html_content+="<p>Le disque partagé <code>${disk_public_path}</code> (${disk_size_in_gb} Gb) est plein ${fullness}%%. Etant donné qu'un disque plein fait échouer les jobs et que cela peut être évité, il est urgent de procéder à un nettoyage, surtout pour les plus gros consommateurs:</p>\n"
            html_content+="<table>\n"
            html_content+="<th>rang</th><th>espace occupé (en Mb)</th><th>propriétaire</th><th>dossier</th>\n"
            html_content+="${disk_usage_table_as_html}"
            html_content+="</table>\n"
            html_content+="<p>Ce message est quotidiennement et automatiquement envoyé à tous les utilisateurs du cluster, jusqu'à ce que le taux de remplissage du disque retombe en dessous de ${fullness_threshold}%%. Si vous vous demandez \"c'est très bien tout ça, mais qu'est-ce que je peux y faire ?\", voici quelques éléments de réponse:</p>\n"
            html_content+="<ul>\n"
            html_content+="<li>si votre nom ne figure pas dans la liste ci-dessus, vous avez toujours la possibilité d'aller harceler un utilisateur dont le nom y figure;\n</li>\n"
            html_content+="<li>si votre nom figure dans la liste, il vous est chaudement recommandé de procéder à un nettoyage de vos fichiers avant que les autres utilisateurs ne vous trouvent! 😉\n</li>\n"
            html_content+="</ul>\n"
            html_content+="Pour rappel (cf <a href=http://intranet.ipr.univ-rennes1.fr/simpaweb/cluster/PhysixUserGuide>guide d'utilisation du cluster IPR</a>), le disque <code>${disk_public_path}</code> n'a pas vocation à servir pour de l'archivage (il n'est d'ailleurs pas sauvegardé!); c'est un espace de travail dont le rôle est de stocker temporairement les données nécessaires à vos jobs. Sauf exception (par exemple, réutilisation des données de sortie pour des jobs futurs), les utilisateurs sont censés rapatrier ou effacer leurs données une fois leurs jobs terminés.\n"
            html_content+="<p>Le comité des utilisateurs du cluster n'a pas opté pour la mise en place de quotas sur <code>${disk_public_path}</code>, jugeant que non seulement cela affecterait la flexibilité, mais que les utilisateurs auraient au final moins d'espace disponible en pratique. Cependant, l'utilisation de ce disque partagé sans quotas ne peut fonctionner que si tout le monde joue le jeu et agit de façon responsable, en nettoyant ses données après utilisation. Svp faites en sorte que l'on puisse continuer de fonctionner ainsi, dans l'intérêt de tous. Si vous estimez que la taille du disque <code>${disk_public_path}</code> est trop petite pour vos besoins, n'hésitez pas à contacter les administrateurs du cluster.</p>\n"
            html_content+="<p>Merci de votre compréhension</p>\n"
            html_content+="<p>Les administrateurs du cluster IPR:</p>\n"
            html_content+="<ul>\n"
            html_content+="${admin_list_as_html}"
            html_content+="</ul>\n"
            html_content+="</body>\n"
            ;;
        *)
            error "unexpected value for language_id : ${language_id}"
            # falls back to 1 if the script-level constant is not defined
            return "${RETURNCODE_ERROR:-1}"
            ;;
    esac

    printf '%s\n' "${html_content}"
}
# Measures the usage of a shared disk, stores a report, and e-mails a warning
# when the disk fullness reaches the given threshold.
#
# A report directory "<reports root>/<disk id>-<timestamp>" is created; it contains:
#   total.txt   - the output of 'df -m' for the disk
#   subdirs.txt - one line "<size in Mb> <owner> <path>" per entry of the disk root
#
# Arguments:
#   $1 - root of the shared disk, eg /mnt/work
#   $2 - directory in which reports are stored, eg "$HOME/var/run/ipr/cluster/disk-watchdog"
#   $3 - public path of the disk (used in the e-mail), eg /opt/ipr/cluster/work.global
#   $4 - fullness threshold, eg '90' for 90%
#   $5 - destination e-mail address, eg ipr-cluster@listes.univ-rennes1.fr
#   $6 - what triggered this check, eg "cron.daily"
#   $7 - number of biggest directories reported, eg 20
# Outputs:
#   the raw 'du' listing on stdout
# Returns:
#   non-zero if the disk fullness can't be determined or if the message can't be built
function check_disk_usage()
{
    local shared_disk_root="$1"
    local reports_root_path="$2"
    local disk_public_path="$3"
    local fullness_threshold="$4"
    local dest_email="$5"
    local trigger_reason="$6"
    local top_size="$7"

    # declarations are kept apart from command substitutions so that a failing command isn't masked by 'local'
    local trigger_date disk_id this_report_dir
    trigger_date="$(date)"
    disk_id="$(basename "${shared_disk_root}")"
    log "check of ${shared_disk_root} triggered by ${trigger_reason} with threshold ${fullness_threshold} %"

    mkdir -p "${reports_root_path}"
    this_report_dir="${reports_root_path}/${disk_id}-$(date +'%Y-%m-%d-%H-%M-%S')"
    mkdir -p "${this_report_dir}"

    # global usage of the disk
    local disk_global_usage_report_file_path="${this_report_dir}/total.txt"
    df -m "${shared_disk_root}" > "${disk_global_usage_report_file_path}"

    # usage of each entry of the disk root
    local disk_usage_report_file_path="${this_report_dir}/subdirs.txt"
    local tmp_disk_usage_report_file_path="${this_report_dir}/subdirs.tmp"
    log "listing usage of $shared_disk_root into $disk_usage_report_file_path"
    du -sm -- "${shared_disk_root}"/* | tee "${tmp_disk_usage_report_file_path}"

    # Add the owner of each entry. The names come from directories that any user
    # can create, so they are only ever handled as quoted data: they must never
    # be pasted into a command line that a shell would re-parse.
    local size_in_mb owner dir_path
    while IFS=$'\t' read -r size_in_mb dir_path
    do
        owner="$(stat --printf='%U' -- "${dir_path}")" || owner='?'
        printf '%s %s %s\n' "${size_in_mb}" "${owner}" "${dir_path}"
    done < "${tmp_disk_usage_report_file_path}" > "${disk_usage_report_file_path}"
    rm -- "${tmp_disk_usage_report_file_path}"

    # NOTE: this variable has to be named 'disk_fullness', because create_html_contents reads it from this scope
    local disk_fullness disk_size_in_mb
    disk_fullness="$(sheet_get_column "${disk_global_usage_report_file_path}" 'Use%' | sed 's/%//')"
    disk_size_in_mb="$(sheet_get_column "${disk_global_usage_report_file_path}" '1M-blocks' | sed 's/%//')"
    if ! [[ "${disk_fullness}" =~ ^[0-9]+$ && "${disk_size_in_mb}" =~ ^[0-9]+$ ]]
    then
        error "unable to get the usage of ${shared_disk_root} from ${disk_global_usage_report_file_path}"
        return "${RETURNCODE_ERROR}"
    fi
    log "disk fullness = $disk_fullness %"

    if [ "${disk_fullness}" -ge "${fullness_threshold}" ]
    then
        local to="${dest_email}"
        local from="info-ipr@univ-rennes.fr"
        # the whole message is later used as a printf format: '%%' stands for '%' and '\n' for a line break
        local subject="warning: The shared disk ${disk_public_path} is full at ${disk_fullness}%% !"
        local html_content=''

        # Table of the biggest entries. As the names are arbitrary, they are
        # escaped both for html (& < >) and for printf (\ %).
        local disk_usage_table_as_html=''
        local rank=0
        while read -r size_in_mb owner dir_path
        do
            rank=$((rank + 1))
            disk_usage_table_as_html+="<tr><td>${rank}</td><td align=\"right\">${size_in_mb}</td><td align=\"center\">${owner}</td><td>${dir_path}</td></tr>\n"
        done < <(sort -r -n -- "${disk_usage_report_file_path}" \
            | head -n "${top_size}" \
            | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' -e 's/\\/\\\\/g' -e 's/%/%%/g')

        local disk_size_in_gb="$((disk_size_in_mb / 1024))"

        local admin_list_as_html=''
        admin_list_as_html+="<li><a href=\"mailto:guillaume.raffy@univ-rennes.fr\">guillaume.raffy@univ-rennes.fr</a>\n</li>\n"
        admin_list_as_html+="<li><a href=\"mailto:jeremy.gardais@univ-rennes.fr\">jeremy.gardais@univ-rennes.fr</a>\n</li>\n"

        local language_id
        local localized_message=''
        for language_id in 'french' 'english'
        do
            html_content+="<h1>${language_id} version</h1>"
            # tested directly so that a failure is handled here even when errexit is on
            if ! localized_message="$(create_html_contents "${language_id}" "${disk_usage_table_as_html}" "${disk_public_path}" "${disk_size_in_gb}" "${fullness_threshold}" "${admin_list_as_html}")"
            then
                return "${RETURNCODE_ERROR}"
            fi
            html_content+="${localized_message}"
        done
        html_content+="<small>disk-watchdog.ipr.univ-rennes1.fr v1.0 - triggered from $(hostname --fqdn) by : $(whoami) (${trigger_reason}) on ${trigger_date}</small>"

        # an empty line is required between the headers and the body of a message
        local sendmail_stdin="To: ${to}\nFrom: ${from}\nSubject: ${subject}\nContent-Type: text/html; charset=\"UTF-8\"\n\n<html>${html_content}</html>\n"
        # shellcheck disable=SC2059 # the message is deliberately built as a printf format
        printf "${sendmail_stdin}" | /usr/sbin/sendmail "${to}"
        log "e-mail sent to ${to}"
    fi
}
# Entry point: from here on, abort on the first command that fails, then run
# the check if (and only if) the 7 expected arguments were given.
set -o errexit
if [[ $# -ne 7 ]]
then
    error "wrong number of arguments"
    exit "${RETURNCODE_ERROR}"
fi
check_disk_usage \
    "${SHARED_DISK_ROOT}" \
    "${REPORTS_ROOT_PATH}" \
    "${DISK_PUBLIC_PATH}" \
    "${FULLNESS_THRESHOLD}" \
    "${DEST_EMAIL}" \
    "${TRIGGER_REASON}" \
    "${TOP_SIZE}"