#!/bin/bash
# disk-watchdog.sh
#
# The role of this script is to measure the size of the directories in
# $SHARED_DISK_ROOT and send an e-mail to $DEST_EMAIL if its fullness goes
# beyond $FULLNESS_THRESHOLD.
# See https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=3193 for details.
#
# Provenance: commit 3c0c41d142e7905071939c7970d26e239d19e5d6
# (Guillaume Raffy, Mon, 9 May 2022 17:48:58 +0200), file cluster/disk-watchdog.sh:
#   "added a script to send a report on /opt/ipr/cluster/work.global usage
#    when it's full. This is to address
#    https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=3193 but this script
#    will need to be triggered by cron.daily on work.ipr.univ-rennes1.fr"
#
# usage:
#   disk-watchdog.sh <shared_disk_root> <reports_root_path> <disk_public_path> \
#                    <fullness_threshold> <dest_email> <triggerer> <top_size>
#
# test run:
# graffy@work:~/bug3193$ ./disk-watchdog.sh /mnt/work/graffy/workspaces/meniscus $HOME/var/run/ipr/cluster/disk-watchdog /opt/ipr/cluster/work.global 90 guillaume.raffy@univ-rennes1.fr "graffy (manual trigger)" 20
#
# production run:
# graffy@work:~/bug3193$ sudo ./disk-watchdog.sh /mnt/work /var/run/ipr/cluster/disk-watchdog /opt/ipr/cluster/work.global 90 ipr-cluster@listes.univ-rennes1.fr "graffy (manual trigger)" 20

set -o errexit
set -o nounset
set -o pipefail

# ${N:-} keeps 'nounset' quiet so that a wrong argument count is reported by
# the explicit check at the bottom of the script
SHARED_DISK_ROOT="${1:-}"   # eg '/mnt/work'
REPORTS_ROOT_PATH="${2:-}"  # eg "$HOME/var/run/ipr/cluster/disk-watchdog"
DISK_PUBLIC_PATH="${3:-}"   # eg '/opt/ipr/cluster/work.global'
FULLNESS_THRESHOLD="${4:-}" # disk fullness percentage above which an e-mail is sent (eg. '90' for 90%)
DEST_EMAIL="${5:-}"         # eg ipr-cluster@listes.univ-rennes1.fr
TRIGGERER="${6:-}"          # eg "daily cron"
TOP_SIZE="${7:-}"           # number of biggest directories reported (eg 20)

readonly RETURNCODE_SUCCESS=0
readonly RETURNCODE_ERROR=1


# Sends a message to syslog with the 'disk-watchdog' tag.
#   $1 : the message
function log()
{
    local message="$1"
    logger -t 'disk-watchdog' "$message"
}

# Prints a timestamped error message on stderr.
#   $1 : the message
function error()
{
    local message="$1"
    echo 1>&2 "$(date) : ERROR : $message"
}

# Prints the values of one column of a whitespace-separated table whose first
# line holds the column names (such as the output of 'df').
#   $1 : path of the table file, eg /var/run/ipr/cluster/disk-watchdog/meniscus-2022-05-07-16-43-38/total.txt
#   $2 : name of the column, eg Use%
# Outputs: one value per data line on stdout
# Returns: non-zero if the file can't be read or has no column named $2
function sheet_get_column()
{
    local sheet_file_path="$1"
    local column_name="$2"

    awk -v col="${column_name}" '
        NR == 1 {
            for (i = 1; i <= NF; i++) {
                if ($i == col) {
                    c = i
                    break
                }
            }
            # without this guard an unknown column leaves c unset and
            # "print $c" would silently print whole lines
            if (!c) {
                exit 1
            }
            next
        }
        {
            print $c
        }
    ' "${sheet_file_path}"
}

# Prints the HTML fragment of the warning message in the requested language.
#   $1 : language id, "french" or "english"
#   $2 : the <tr> rows of the disk usage table
#   $3 : the user-visible path of the disk, eg /opt/ipr/cluster/work.global
#   $4 : the size of the disk in Gb
#   $5 : the fullness threshold, eg '90' for 90%
#   $6 : the <li> items listing the administrators
#   $7 : (optional) the disk fullness percentage; when omitted, falls back to
#        the caller's $disk_fullness variable, which is what earlier versions
#        of this function implicitly relied on
# Outputs: final HTML on stdout (plain text, not a printf format string)
# Returns: $RETURNCODE_ERROR for an unknown language id
#
# NOTE(review): the HTML tags of these templates were missing from the copy of
# the script this revision is based on; the markup below (<p>, <table>, <ul>)
# is a reconstruction -- confirm it against the intended layout.
function create_html_contents()
{
    local language_id="$1"
    local disk_usage_table_as_html="$2"
    local disk_public_path="$3"
    local disk_size_in_gb="$4"
    local fullness_threshold="$5"
    local admin_list_as_html="$6"
    # the expansion happens before 'local' takes effect, so the inner
    # ${disk_fullness} is the caller's variable
    local disk_fullness="${7:-${disk_fullness:-}}"

    case "${language_id}" in
        'english')
            cat <<EOF
<div>
<p>This is an automatic message, please don't reply.</p>
<p>The shared disk ${disk_public_path} (${disk_size_in_gb} Gb) is ${disk_fullness}% full. As a full disk will cause the jobs to unnecessarily fail, it's urgent to do some cleanup especially for the top biggest users:</p>
<table>
<tr><th>rank</th><th>used size (in Mb)</th><th>owner</th><th>folder</th></tr>
${disk_usage_table_as_html}
</table>
<p>This message is automatically sent daily to all cluster users until the disk usage goes below ${fullness_threshold}%. So if you're wondering "All this is good, but can I do something to avoid this annoying message?", then you'll probably be happy to hear that the answer is yes:</p>
<p>Reminder (cf IPR cluster user guide): the disk ${disk_public_path} is not intended to be used for archiving (it's not even backed up!); it's a workspace whose purpose is to temporarily store the data required to make the jobs work. Except in some special use cases (eg reuse the output data as input for future jobs), the users are expected to transfer or delete the output files on ${disk_public_path} once their job has ended.</p>
<p>The cluster user committee decided not to put quotas per user on ${disk_public_path} because it would result in significantly less disk space per user and less flexibility. However, using this shared disk without quotas can only work if all users act in a responsible manner, cleaning up their own data after usage. Please make this work, it's in the interest of all users, including you. If you think the size of ${disk_public_path} is too small for your needs, please contact the cluster administrators.</p>
<p>Thank you for your understanding</p>
<p>The IPR cluster administrators:</p>
<ul>
${admin_list_as_html}
</ul>
</div>
EOF
            ;;

        'french')
            cat <<EOF
<div>
<p>Ceci est un message automatique, ne pas répondre svp.</p>
<p>Le disque partagé ${disk_public_path} (${disk_size_in_gb} Gb) est plein à ${disk_fullness}%. Etant donné qu'un disque plein fait échouer les jobs et que cela peut être évité, il est urgent de procéder à un nettoyage, surtout pour les plus gros consommateurs:</p>
<table>
<tr><th>rang</th><th>espace occupé (en Mb)</th><th>propriétaire</th><th>dossier</th></tr>
${disk_usage_table_as_html}
</table>
<p>Ce message est quotidiennement et automatiquement envoyé à tous les utilisateurs du cluster, jusqu'à ce que le taux de remplissage du disque retombe en dessous de ${fullness_threshold}%. Si vous vous demandez "c'est très bien tout ça, mais qu'est-ce que je peux y faire ?", voici quelques éléments de réponse:</p>
<p>Pour rappel (cf guide d'utilisation du cluster IPR), le disque ${disk_public_path} n'a pas vocation à servir pour de l'archivage (il n'est d'ailleurs pas sauvegardé!); c'est un espace de travail dont le rôle est de stocker temporairement les données nécessaires à vos jobs. Sauf exception (par exemple, réutilisation des données de sortie pour des jobs futurs), les utilisateurs sont censés rapatrier ou effacer leurs données une fois leurs jobs terminés.</p>
<p>Le comité des utilisateurs du cluster n'a pas opté pour la mise en place de quotas sur ${disk_public_path}, jugeant que non seulement cela affecterait la flexibilité, mais que les utilisateurs auraient au final moins d'espace disponible en pratique. Cependant, l'utilisation de ce disque partagé sans quotas ne peut fonctionner que si tout le monde joue le jeu et agit de façon responsable, en nettoyant ses données après utilisation. Svp faites en sorte que l'on puisse continuer de fonctionner ainsi, dans l'intérêt de tous. Si vous estimez que la taille du disque ${disk_public_path} est trop petite pour vos besoins, n'hésitez pas à contacter les administrateurs du cluster.</p>
<p>Merci de votre compréhension</p>
<p>Les administrateurs du cluster IPR:</p>
<ul>
${admin_list_as_html}
</ul>
</div>
EOF
            ;;

        *)
            error "unexpected value for language_id : ${language_id}"
            return "${RETURNCODE_ERROR}"
            ;;
    esac
}

# Measures the usage of a shared disk, stores a report, and e-mails a warning
# when the disk fullness reaches the threshold.
#   $1 : the root of the shared disk, eg /mnt/work
#   $2 : the directory in which reports are stored, eg "$HOME/var/run/ipr/cluster/disk-watchdog"
#   $3 : the user-visible path of the disk, eg /opt/ipr/cluster/work.global
#   $4 : the fullness threshold, eg '90' for 90%
#   $5 : the recipient of the warning, eg ipr-cluster@listes.univ-rennes1.fr
#   $6 : who triggered the check, eg "work.ipr.univ-rennes1.fr:/etc/cron.daily"
#   $7 : number of biggest directories reported (eg 20)
# Side effects: creates <$2>/<disk id>-<timestamp>/{total.txt,subdirs.txt},
#   writes to syslog, echoes the 'du' results on stdout, may send an e-mail.
# Returns: $RETURNCODE_ERROR on invalid arguments or when a step fails
function check_disk_usage()
{
    local shared_disk_root="$1"
    local reports_root_path="$2"
    local disk_public_path="$3"
    local fullness_threshold="$4"
    local dest_email="$5"
    local triggerer="$6"
    local top_size="$7"
    local trigger_date
    trigger_date="$(date)"
    local nl=$'\n'

    # a non-numeric value would make the '-ge' test below fail silently
    # (no e-mail, no error), so reject it upfront
    local numeric_arg
    for numeric_arg in "${fullness_threshold}" "${top_size}"
    do
        if ! [[ "${numeric_arg}" =~ ^[0-9]+$ ]]
        then
            error "expected a non-negative integer but got '${numeric_arg}'"
            return "${RETURNCODE_ERROR}"
        fi
    done

    local disk_id
    disk_id="$(basename -- "${shared_disk_root}")"
    log "check of ${shared_disk_root} triggered by ${triggerer} with threshold ${fullness_threshold} %"

    mkdir -p -- "${reports_root_path}"
    local this_report_dir
    this_report_dir="${reports_root_path}/${disk_id}-$(date +'%Y-%m-%d-%H-%M-%S')"
    mkdir -p -- "${this_report_dir}"

    # LC_ALL=C : sheet_get_column looks columns up by their english names
    local disk_global_usage_report_file_path="${this_report_dir}/total.txt"
    LC_ALL=C df -m -- "${shared_disk_root}" > "${disk_global_usage_report_file_path}"

    local disk_usage_report_file_path="${this_report_dir}/subdirs.txt"
    log "listing usage of $shared_disk_root into $disk_usage_report_file_path"

    # when the glob matches nothing it stays literal: don't hand it to du
    local -a entries=()
    local entry
    for entry in "${shared_disk_root}"/*
    do
        if [ -e "${entry}" ] || [ -L "${entry}" ]
        then
            entries+=("${entry}")
        fi
    done

    # subdirs.txt holds one "<size in Mb><TAB><owner><TAB><path>" line per entry.
    # Directory names are chosen by the cluster users while this script runs as
    # root: they are only ever passed as arguments (never through a shell
    # command line) and du's records are NUL-terminated so that any name is safe.
    : > "${disk_usage_report_file_path}"
    if [ "${#entries[@]}" -gt 0 ]
    then
        local size path owner printable_path
        while IFS=$'\t' read -r -d '' size path
        do
            printf '%s\t%s\n' "${size}" "${path}" # same progress output as a plain 'du -sm'
            owner="$(stat --printf='%U' -- "${path}" 2> /dev/null)" || owner='?'
            # keep the report line-oriented even for exotic names
            printable_path="$(printf '%s' "${path}" | tr '\t\n' '??')"
            printf '%s\t%s\t%s\n' "${size}" "${owner}" "${printable_path}" >> "${disk_usage_report_file_path}"
        done < <(du -sm --null -- "${entries[@]}")
    fi

    # TODO: show the folders under ${disk_public_path} instead of
    # ${shared_disk_root} in the report, since that's the path users know

    local disk_fullness
    if ! disk_fullness="$(sheet_get_column "${disk_global_usage_report_file_path}" 'Use%')"
    then
        error "failed to read the 'Use%' column of ${disk_global_usage_report_file_path}"
        return "${RETURNCODE_ERROR}"
    fi
    disk_fullness="${disk_fullness%\%}"

    local disk_size_in_mb
    if ! disk_size_in_mb="$(sheet_get_column "${disk_global_usage_report_file_path}" '1M-blocks')"
    then
        error "failed to read the '1M-blocks' column of ${disk_global_usage_report_file_path}"
        return "${RETURNCODE_ERROR}"
    fi

    if ! [[ "${disk_fullness}" =~ ^[0-9]+$ && "${disk_size_in_mb}" =~ ^[0-9]+$ ]]
    then
        error "unexpected df output in ${disk_global_usage_report_file_path} (fullness='${disk_fullness}', size='${disk_size_in_mb}')"
        return "${RETURNCODE_ERROR}"
    fi
    log "disk fullness = $disk_fullness %"

    if [ "${disk_fullness}" -lt "${fullness_threshold}" ]
    then
        return "${RETURNCODE_SUCCESS}"
    fi

    local to="$dest_email"
    local from="disk-watchdog@work.ipr.univ-rennes1.fr"
    local subject="warning: The shared disk ${disk_public_path} is full at ${disk_fullness}% !"

    # biggest entries first; awk (rather than head) applies the limit so that
    # sort never gets a SIGPIPE under 'pipefail'. Owner and folder are
    # HTML-escaped because they are user-controlled.
    local disk_usage_table_as_html
    disk_usage_table_as_html="$(sort -r -n -- "${disk_usage_report_file_path}" | awk -F '\t' -v top="${top_size}" '
        function html_escape(s) {
            gsub(/&/, "\\&amp;", s)
            gsub(/</, "\\&lt;", s)
            gsub(/>/, "\\&gt;", s)
            return s
        }
        NR <= top {
            printf("<tr><td>%s</td><td>%d</td><td>%s</td><td>%s</td></tr>\n", NR, $1, html_escape($2), html_escape($3))
        }
    ')"

    local disk_size_in_gb="$((disk_size_in_mb / 1024))"
    local admin_list_as_html="<li>jeremy.gardais@univ-rennes1.fr</li>${nl}<li>guillaume.raffy@univ-rennes1.fr</li>"

    local html_content="<html>${nl}<body>${nl}"
    local language_id localized_message
    for language_id in 'french' 'english'
    do
        html_content="${html_content}<h1>${language_id} version</h1>${nl}"
        if ! localized_message="$(create_html_contents "${language_id}" "${disk_usage_table_as_html}" "${disk_public_path}" "${disk_size_in_gb}" "${fullness_threshold}" "${admin_list_as_html}" "${disk_fullness}")"
        then
            return "${RETURNCODE_ERROR}"
        fi
        html_content="${html_content}${localized_message}${nl}"
    done

    local host_fqdn
    host_fqdn="$(hostname --fqdn 2> /dev/null)" || host_fqdn="$(hostname)"
    html_content="${html_content}<p>disk-watchdog.ipr.univ-rennes1.fr v1.0 - triggered from ${host_fqdn} by : ${triggerer} on ${trigger_date}</p>${nl}</body>${nl}</html>"

    # The message is passed to printf as data, never as the format string.
    # The empty line is what separates the headers from the body.
    if ! printf '%s\n' \
        "To: ${to}" \
        "From: ${from}" \
        "Subject: ${subject}" \
        'MIME-Version: 1.0' \
        'Content-Type: text/html; charset="UTF-8"' \
        '' \
        "${html_content}" \
        | /usr/sbin/sendmail "${to}"
    then
        error "failed to send the e-mail to ${to}"
        return "${RETURNCODE_ERROR}"
    fi
    log "e-mail sent to ${to}"
}

if [ $# = 7 ]
then
    check_disk_usage "${SHARED_DISK_ROOT}" "${REPORTS_ROOT_PATH}" "${DISK_PUBLIC_PATH}" "${FULLNESS_THRESHOLD}" "${DEST_EMAIL}" "${TRIGGERER}" "${TOP_SIZE}"
else
    error "wrong number of arguments"
    echo 1>&2 "usage: $0 <shared_disk_root> <reports_root_path> <disk_public_path> <fullness_threshold> <dest_email> <triggerer> <top_size>"
    exit "${RETURNCODE_ERROR}"
fi