added mechanism to download benchmark results on work.global on success

2022-06-09 08:58:04 +02:00 · 2022-06-09 08:58:04 +02:00 · 733fda5517
parent 75c4b98be0
commit 733fda5517
2 changed files with 37 additions and 12 deletions
--- a/.ipr/hibench.job
+++ b/.ipr/hibench.job
@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-
+# this job file is a template file used by launch-perf-jobs.sh

 git_repos_url="$1" # eg "https://github.com/hibridon/hibridon"
 git_user="$2" # eg 'g-raffy'
@ -10,16 +10,23 @@ benchmark_command="$6"  # eg 'ctest -L ^arch4_quick$'
 env_vars_bash_commands="$7"  # defines extra environment variables prior to launch starbench. eg "export MKLROOT=/opt/intel/compilers_and_libraries_2020.1.217/linux/mkl"

 cmake_path='/opt/cmake/cmake-3.23.0/bin/cmake'
+executed_by_sge=''

 if [ "${JOB_ID}" = '' ]
 then
+	executed_by_sge='false'
 	# this script is not executed by sge... set dummy values for test
 	TMPDIR=/tmp
 	JOB_ID=666666
 	NSLOTS=2
+else
+	executed_by_sge='true'
 fi

-echo "Executing job ${JOB_ID} on $(hostname)"
+launch_dir="$(pwd)"
+
+echo "Executing job ${JOB_ID} on $(hostname) from ${launch_dir}"
+echo "date: $(date --iso-8601=seconds)"
 temp_dir=${TMPDIR}/$(whoami)/${JOB_ID}
 if [ -d "${temp_dir}" ]
 then
@ -59,5 +66,22 @@ do
 	command="${command} --cmake-option=${cmake_option}"
 done
 command="${command} --benchmark-command=\"${benchmark_command}\""
-echo "command: $command"
-eval $command
+
+echo "command: ${command}"
+eval ${command}
+if [ "$?" = '0' ]
+then
+	echo "the command ${command} succeeded"
+	rsync -va --exclude 'build' --exclude 'source.git' "${output_dir}/"  ${launch_dir}/  # exclude the source.git and build directories (one for each worker) because they are big and not that precious
+	# TMPDIR will be deleted by sge at the end of the job
+else
+	if [ ${executed_by_sge} = 'true' ]
+	then
+		# TMPDIR will be deleted by sge at the end of the job. Backup data for investigation
+		backup_dir="/opt/ipr/cluster/work.local/$(whoami)/${JOB_ID}"
+		echo "moving ${output_dir} to ${backup_dir} to that it doesn't get deleted by sge at the end of the job. This way, data gets a chance to be investagated then manually deleted." 
+		mv "${output_dir}" "${backup_dir}"
+	fi
+	echo "the command ${command} failed... the output data dir (${output_dir}) is expected to be cleaned up manually after investigation"
+	exit 1
+fi
--- a/.ipr/starbench.py
+++ b/.ipr/starbench.py
@ -235,8 +235,9 @@ def starbench_cmake_app(git_repos_url: str, code_version: str, tmp_dir: Path, nu
        subprocess.run(['git', 'checkout', '%s' % (code_version)], cwd=str(src_dir), check=True)

    # we need one build for each parallel run, otherwise running ctest in parallel would overwrite the same files, which causes the tests to randomly fail depending on race conditions
-    build_dir = tmp_dir / 'worker<worker_id>'
-    print('creating build directory %s' % build_dir)
+    worker_dir = tmp_dir / 'worker<worker_id>'
+    build_dir = worker_dir / 'build'
+    print('creating build directory %s' % worker_dir)
    create_build_dir = StarBencher(
        run_command=['mkdir', '-p', build_dir],
        num_cores_per_run=1,
@ -259,8 +260,8 @@ def starbench_cmake_app(git_repos_url: str, code_version: str, tmp_dir: Path, nu
        max_num_cores=num_cores,
        stop_condition=StopAfterSingleRun(),
        run_command_cwd=build_dir,
-        stdout_filepath=build_dir / 'configure_stdout.txt',
-        stderr_filepath=build_dir / 'configure_stderr.txt')
+        stdout_filepath=worker_dir / 'configure_stdout.txt',
+        stderr_filepath=worker_dir / 'configure_stderr.txt')
    configure_duration = configure.run()  # noqa: F841

    print('building %s ...' % (build_dir))
@ -271,8 +272,8 @@ def starbench_cmake_app(git_repos_url: str, code_version: str, tmp_dir: Path, nu
        max_num_cores=num_cores,
        stop_condition=StopAfterSingleRun(),
        run_command_cwd=build_dir,
-        stdout_filepath=build_dir / 'build_stdout.txt',
-        stderr_filepath=build_dir / 'build_stderr.txt')
+        stdout_filepath=worker_dir / 'build_stdout.txt',
+        stderr_filepath=worker_dir / 'build_stderr.txt')
    build_duration = build.run()  # noqa: F841

    print('benchmarking %s ...' % (build_dir))
@ -284,8 +285,8 @@ def starbench_cmake_app(git_repos_url: str, code_version: str, tmp_dir: Path, nu
        max_num_cores=num_cores,
        stop_condition=stop_condition,
        run_command_cwd=build_dir,
-        stdout_filepath=build_dir / 'bench_stdout.txt',
-        stderr_filepath=build_dir / 'bench_stderr.txt')
+        stdout_filepath=worker_dir / 'bench_stdout.txt',
+        stderr_filepath=worker_dir / 'bench_stderr.txt')
    mean_duration = bench.run()
    print('duration : %.3f s' % (mean_duration))