2024-10-04 18:27:21 +02:00
#!/usr/bin/env python3
# this script launches jobs to run hibridon benchmarks on physix cluster for the given version of hibridon (commit number)
from typing import List , Tuple
from argparse import ArgumentParser
import os
from os import getenv , makedirs
from pathlib import Path
import subprocess
HostFqdn = str # eg 'physix90.ipr.univ-rennes1.fr'
GitCommitTag = str # commit number eg 'a3bed1c3ccfbca572003020d3e3d3b1ff3934fad'
HostGroupId = str # eg 'xeon_gold_6140'
CompilerId = str # eg 'gfortran'
def substitute_tag_with_filecontents ( input_file_path : Path , tag : str , contents_file : Path , output_file_path : Path ) :
contents = open ( contents_file , ' rt ' , encoding = ' utf8 ' ) . read ( )
with open ( input_file_path , ' rt ' , encoding = ' utf8 ' ) as template_file , open ( output_file_path , ' wt ' , encoding = ' utf8 ' ) as out_file :
for template_line in template_file . readlines ( ) :
line = template_line . replace ( tag , contents )
out_file . write ( line )
def get_host_group_info ( host_group_id : HostGroupId ) - > Tuple [ List [ HostFqdn ] , int ] :
if host_group_id == ' intel_xeon_x5550 ' :
hosts = [ ' physix48.ipr.univ-rennes1.fr ' ]
num_cores = ' 8 '
elif host_group_id == ' intel_xeon_x5650 ' :
hosts = [
' physix49.ipr.univ-rennes1.fr ' ,
' physix50.ipr.univ-rennes1.fr ' ,
' physix51.ipr.univ-rennes1.fr ' ,
' physix52.ipr.univ-rennes1.fr ' ,
' physix53.ipr.univ-rennes1.fr ' ,
' physix54.ipr.univ-rennes1.fr ' ,
' physix55.ipr.univ-rennes1.fr ' ,
' physix56.ipr.univ-rennes1.fr ' ,
' physix57.ipr.univ-rennes1.fr ' ,
' physix58.ipr.univ-rennes1.fr ' ,
' physix59.ipr.univ-rennes1.fr ' , ]
num_cores = ' 12 '
elif host_group_id == ' intel_xeon_e5-2660 ' :
hosts = [
' physix60.ipr.univ-rennes1.fr ' ,
' physix61.ipr.univ-rennes1.fr ' ,
' physix62.ipr.univ-rennes1.fr ' ,
' physix63.ipr.univ-rennes1.fr ' ,
' physix64.ipr.univ-rennes1.fr ' ,
' physix65.ipr.univ-rennes1.fr ' ,
' physix66.ipr.univ-rennes1.fr ' ,
' physix67.ipr.univ-rennes1.fr ' ,
' physix68.ipr.univ-rennes1.fr ' ,
' physix69.ipr.univ-rennes1.fr ' ,
' physix70.ipr.univ-rennes1.fr ' ,
' physix71.ipr.univ-rennes1.fr ' ]
num_cores = ' 16 '
elif host_group_id == ' intel_xeon_e5-2660v2 ' :
hosts = [
' physix72.ipr.univ-rennes1.fr ' ,
' physix73.ipr.univ-rennes1.fr ' ,
' physix74.ipr.univ-rennes1.fr ' ,
' physix75.ipr.univ-rennes1.fr ' ,
' physix76.ipr.univ-rennes1.fr ' ,
' physix77.ipr.univ-rennes1.fr ' ,
' physix78.ipr.univ-rennes1.fr ' ,
' physix79.ipr.univ-rennes1.fr ' ,
' physix80.ipr.univ-rennes1.fr ' ,
' physix81.ipr.univ-rennes1.fr ' ,
' physix82.ipr.univ-rennes1.fr ' ,
' physix84.ipr.univ-rennes1.fr ' ]
num_cores = ' 20 '
elif host_group_id == ' intel_xeon_e5-2660v4 ' :
hosts = [
' physix84.ipr.univ-rennes1.fr ' ,
' physix85.ipr.univ-rennes1.fr ' ,
' physix86.ipr.univ-rennes1.fr ' ,
' physix87.ipr.univ-rennes1.fr ' ]
num_cores = ' 28 '
elif host_group_id == ' intel_xeon_gold_6140 ' :
hosts = [
' physix88.ipr.univ-rennes1.fr ' ,
' physix89.ipr.univ-rennes1.fr ' ]
num_cores = ' 36 '
elif host_group_id == ' intel_xeon_gold_6154 ' :
hosts = [
' physix90.ipr.univ-rennes1.fr ' ]
num_cores = ' 72 '
elif host_group_id == ' intel_xeon_gold_5222 ' :
hosts = [
' physix92.ipr.univ-rennes1.fr ' ]
num_cores = ' 4 '
elif host_group_id == ' intel_xeon_gold_6226r ' :
hosts = [
' physix93.ipr.univ-rennes1.fr ' ,
' physix94.ipr.univ-rennes1.fr ' ]
num_cores = ' 32 '
elif host_group_id == ' intel_xeon_gold_6240r ' :
hosts = [
' physix99.ipr.univ-rennes1.fr ' ]
num_cores = ' 48 '
elif host_group_id == ' intel_xeon_gold_6248r ' :
hosts = [
' physix95.ipr.univ-rennes1.fr ' ,
' physix96.ipr.univ-rennes1.fr ' ,
' physix97.ipr.univ-rennes1.fr ' ,
' physix98.ipr.univ-rennes1.fr ' ,
' physix99.ipr.univ-rennes1.fr ' ,
' physix100.ipr.univ-rennes1.fr ' ,
' physix101.ipr.univ-rennes1.fr ' ,
' physix102.ipr.univ-rennes1.fr ' ]
num_cores = ' 48 '
elif host_group_id == ' amd_epyc_7282 ' :
hosts = [
' physix12.ipr.univ-rennes1.fr ' ,
' physix13.ipr.univ-rennes1.fr ' ,
' physix14.ipr.univ-rennes1.fr ' ,
' physix15.ipr.univ-rennes1.fr ' ]
num_cores = ' 32 '
else :
assert f " unhandled host_group_id : { host_group_id } "
return ( hosts , num_cores )
2024-10-07 12:16:47 +02:00
def launch_job_for_host_group ( hibridon_version : GitCommitTag , host_group_id : HostGroupId , results_dir : Path , compiler_id : CompilerId ) :
2024-10-04 18:27:21 +02:00
( hosts , num_cores ) = get_host_group_info ( host_group_id )
# quick_test = 'arch4_quick' # about 2s on a core i5 8th generation
representative_test = ' nh3h2_qma_long ' # about 10min on a core i5 8th generation
benchmark_test = representative_test
if benchmark_test == ' arch4_quick ' :
ram_per_core = ' 1G '
elif benchmark_test == ' nh3h2_qma_long ' :
ram_per_core = ' 2.8G ' # this was enough on physix48, but maybe we can reduce more
else :
assert f ' unhandled benchmark_test : { benchmark_test } '
git_repos_url = ' https://github.com/hibridon/hibridon '
git_user = ' g-raffy ' # os.environ['HIBRIDON_REPOS_USER']
git_pass_file = f ' { getenv ( " HOME " ) } /.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat '
cmake_options = [
' -DCMAKE_BUILD_TYPE=Release ' , # build in release mode for highest performance
' -DBUILD_TESTING=ON ' # enable hibridon tests
]
benchmark_command = f ' ctest --output-on-failure -L ^ { benchmark_test } $ '
env_vars_bash_commands = ' '
if compiler_id == ' ifort ' :
env_vars_bash_commands = ' module load compilers/ifort/latest '
cmake_options . append ( ' -DCMAKE_Fortran_COMPILER=ifort ' ) # use intel fortran compiler
cmake_options . append ( ' -DBLA_VENDOR=Intel10_64lp ' ) # use 64 bits intel mkl with multithreading
elif compiler_id == ' gfortran ' :
env_vars_bash_commands = ' '
cmake_options . append ( ' -DCMAKE_Fortran_COMPILER=gfortran ' ) # use gfortran compiler
else :
assert f ' unhandled compiler_id : { compiler_id } '
2024-10-07 12:16:47 +02:00
makedirs ( results_dir , exist_ok = True )
2024-10-04 18:27:21 +02:00
2024-10-07 12:16:47 +02:00
this_bench_dir = Path ( f ' { results_dir } / { hibridon_version } / { benchmark_test } / { host_group_id } / { compiler_id } ' )
2024-10-04 18:27:21 +02:00
makedirs ( this_bench_dir , exist_ok = True )
starbench_job_path = this_bench_dir / ' starbench.job '
this_file_path = os . path . realpath ( __file__ )
scripts_dir = this_file_path . parent
# create the job file (which embeds starbench.py)
substitute_tag_with_filecontents ( input_file_path = scripts_dir / ' starbench-template.job ' , tag = ' <include:starbench.py> ' , contents_file = scripts_dir / ' starbench.py ' , output_file_path = starbench_job_path )
subprocess . run ( f ' chmod a+x { starbench_job_path } ' , check = True )
2024-10-08 16:58:40 +02:00
command = f ' { starbench_job_path } " { git_repos_url } " " { git_user } " " { git_pass_file } " " { hibridon_version } " " { " " . join ( cmake_options ) } " " { benchmark_command } " " { env_vars_bash_commands } " '
2024-10-04 18:27:21 +02:00
print ( f ' command = { command } ' )
qsub_command = ' qsub '
qsub_command + = f ' -pe smp { num_cores } '
2024-10-08 16:58:40 +02:00
qsub_command + = f ' -l " hostname= { " | " . join ( hosts ) } " '
2024-10-04 18:27:21 +02:00
qsub_command + = ' -cwd '
qsub_command + = ' -m ae '
qsub_command + = f ' -l mem_available= { ram_per_core } '
qsub_command + = ' -j y ' # merge stderr file into stdout file for easier reading of history of events
qsub_command + = f ' -N hibench_ { host_group_id } _ { compiler_id } _ { hibridon_version } '
qsub_command + = f ' { command } '
print ( f ' qsub_command = { qsub_command } ' )
subprocess . run ( qsub_command , cwd = this_bench_dir , check = True )
2024-10-07 12:16:47 +02:00
def launch_perf_jobs ( hibridon_version : GitCommitTag , results_dir : Path ) :
2024-10-04 18:27:21 +02:00
"""
hibridon_version : the version of hibridon to test , in the form of a valid commit number eg ' a3bed1c3ccfbca572003020d3e3d3b1ff3934fad '
2024-10-07 12:16:47 +02:00
results_dir : where the results of the benchmark are stored ( eg $ GLOBAL_WORK_DIR / graffy / benchmarks / hibench )
2024-10-04 18:27:21 +02:00
"""
compilers = [
2024-10-08 16:58:40 +02:00
' gfortran ' ,
2024-10-04 18:27:21 +02:00
' ifort '
]
host_groups = [
' intel_xeon_x5550 ' ,
' intel_xeon_x5650 ' ,
' intel_xeon_e5-2660 ' ,
' intel_xeon_e5-2660v2 ' ,
' intel_xeon_e5-2660v4 ' ,
' intel_xeon_gold_6140 ' ,
' intel_xeon_gold_6154 ' ,
' intel_xeon_gold_5222 ' ,
' intel_xeon_gold_6226r ' ,
' intel_xeon_gold_6240r ' ,
' intel_xeon_gold_6248r ' ,
' amd_epyc_7282 ' ,
]
for compiler in compilers :
for host_group in host_groups :
2024-10-07 12:16:47 +02:00
launch_job_for_host_group ( hibridon_version , host_group , results_dir , compiler )
def path_is_reachable_by_compute_nodes ( path : Path ) :
path_is_reachable = False
for shared_disk_path in [ Path ( ' /opt/ipr/cluster/work.global ' ) ] :
try :
_ = path . relative_to ( shared_disk_path )
except ValueError :
continue
path_is_reachable = True
break
return path_is_reachable
2024-10-04 18:27:21 +02:00
def main ( ) :
arg_parser = ArgumentParser ( description = ' launches hibridon benchmark jobs on IPR \' s physix cluster ' , epilog = ' example: \n --commit-id a3bed1c3ccfbca572003020d3e3d3b1ff3934fad ' )
arg_parser . add_argument ( ' --commit-id ' , type = str , required = True , help = ' the commit id of the version of code to benchmark ' )
2024-10-07 12:16:47 +02:00
arg_parser . add_argument ( ' --results-dir ' , type = Path , required = True , help = ' the root directory of the tree where the results of the benchmarks are stored (eg $GLOBAL_WORK_DIR/graffy/benchmarks/hibench) ' )
2024-10-04 18:27:21 +02:00
args = arg_parser . parse_args ( )
hibridon_version = args . commit_id
2024-10-07 12:16:47 +02:00
2024-10-04 18:27:21 +02:00
# the version of hibridon to test, in the form of a valid commit number eg 'a3bed1c3ccfbca572003020d3e3d3b1ff3934fad'
# '53894da48505892bfa05693a52312bacb12c70c9' # latest from branch master as of 10/06/2022 00:30
# code_version='dd0f413b85cf0f727a5a4e88b2b02d75a28b377f' # latest from branch graffy-issue51 as of 10/06/2022 00:30
2024-10-07 12:16:47 +02:00
results_dir = Path ( args . results_dir )
if not path_is_reachable_by_compute_nodes ( results_dir ) :
raise ValueError ( ' the results path is expected to be on a disk that is accessible to all cluster nodes, and it doesn \' t seem to be the case for {results_dir} ' )
launch_perf_jobs ( hibridon_version , results_dir )
2024-10-04 18:27:21 +02:00
main ( )