improvements to hibenchonphysix:

- although still hardcoded, made it easier to switch between alambix and physix clusters
- although still hardcoded, made it easier to switch to test mode (quick test)
- removed hardcoded value for all_hosts_groups as it is retreived from cluster node database

nb: changes made on 08/10/2024

work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372]
This commit is contained in:
Guillaume Raffy 2024-10-09 15:16:02 +02:00
parent 3ca66d0063
commit fb6f926cb1
1 changed files with 92 additions and 85 deletions

View File

@ -8,6 +8,8 @@ import shutil
from pathlib import Path
import subprocess
import re
import logging
HostFqdn = str # eg 'physix90.ipr.univ-rennes1.fr'
GitCommitTag = str # commit number eg 'a3bed1c3ccfbca572003020d3e3d3b1ff3934fad'
@ -62,71 +64,76 @@ class ClusterNodeDb:
def __init__(self):
self.cluster_nodes_defs = []
self.add_cluster_node_def(ClusterNodeDef('alambix50.ipr.univ-rennes.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('alambix75.ipr.univ-rennes.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix103.ipr.univ-rennes.fr', 'amd_epyc_7452', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix104.ipr.univ-rennes.fr', 'intel_xeon_gold_6248r', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix105.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix106.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix107.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix108.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix12.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix13.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix14.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix15.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix48.ipr.univ-rennes1.fr', 'intel_xeon_x5550', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix49.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix51.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix52.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix53.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix54.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix55.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix56.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix57.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix58.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix59.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix60.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix61.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix62.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix63.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix64.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix65.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix66.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix67.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix68.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix69.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix70.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix71.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix72.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix73.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix74.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix76.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix77.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix78.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix79.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix80.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix81.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix82.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix83.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix84.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix85.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix86.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix87.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix88.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix89.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix90.ipr.univ-rennes1.fr', 'intel_xeon_gold_6154', 4))
# self.add_cluster_node_def(ClusterNodeDef('physix91.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 4))
# self.add_cluster_node_def(ClusterNodeDef('physix92.ipr.univ-rennes1.fr', 'intel_xeon_gold_5220', 1))
# self.add_cluster_node_def(ClusterNodeDef('physix93.ipr.univ-rennes1.fr', 'intel_xeon_gold_6226r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix94.ipr.univ-rennes1.fr', 'intel_xeon_gold_6226r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix95.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix96.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix97.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix98.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix99.ipr.univ-rennes1.fr', 'intel_xeon_gold_6240r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix100.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix101.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
# self.add_cluster_node_def(ClusterNodeDef('physix102.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
cluster_id = 'alambix'
include_multiqueue_nodes = False # at the moment hibench only works on nodes that have all their cores in the same queue
if cluster_id == 'alambix':
self.add_cluster_node_def(ClusterNodeDef('alambix50.ipr.univ-rennes.fr', 'intel_xeon_x5650', 2))
if include_multiqueue_nodes:
self.add_cluster_node_def(ClusterNodeDef('physix90.ipr.univ-rennes1.fr', 'intel_xeon_gold_6154', 4)) # also has some cores reserved for gpuonly.q
self.add_cluster_node_def(ClusterNodeDef('alambix103.ipr.univ-rennes.fr', 'amd_epyc_7452', 2))
if include_multiqueue_nodes:
self.add_cluster_node_def(ClusterNodeDef('alambix104.ipr.univ-rennes.fr', 'intel_xeon_gold_6248r', 2)) # also has some cores reserved for gpuonly.q
self.add_cluster_node_def(ClusterNodeDef('alambix105.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix106.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix107.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
self.add_cluster_node_def(ClusterNodeDef('alambix108.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
elif cluster_id == 'physix':
self.add_cluster_node_def(ClusterNodeDef('physix12.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
self.add_cluster_node_def(ClusterNodeDef('physix13.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
self.add_cluster_node_def(ClusterNodeDef('physix14.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
self.add_cluster_node_def(ClusterNodeDef('physix15.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
self.add_cluster_node_def(ClusterNodeDef('physix48.ipr.univ-rennes1.fr', 'intel_xeon_x5550', 2))
self.add_cluster_node_def(ClusterNodeDef('physix49.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix51.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix52.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix53.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix54.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix55.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix56.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix57.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix58.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix59.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
self.add_cluster_node_def(ClusterNodeDef('physix60.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix61.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix62.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix63.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix64.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix65.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix66.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix67.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix68.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix69.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix70.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix71.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
self.add_cluster_node_def(ClusterNodeDef('physix72.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix73.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix74.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix76.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix77.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix78.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix79.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix80.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix81.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix82.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix83.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
self.add_cluster_node_def(ClusterNodeDef('physix84.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
self.add_cluster_node_def(ClusterNodeDef('physix85.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
self.add_cluster_node_def(ClusterNodeDef('physix86.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
self.add_cluster_node_def(ClusterNodeDef('physix87.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
self.add_cluster_node_def(ClusterNodeDef('physix88.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 2))
self.add_cluster_node_def(ClusterNodeDef('physix89.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 2))
self.add_cluster_node_def(ClusterNodeDef('physix91.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 4))
self.add_cluster_node_def(ClusterNodeDef('physix92.ipr.univ-rennes1.fr', 'intel_xeon_gold_5220', 1))
self.add_cluster_node_def(ClusterNodeDef('physix93.ipr.univ-rennes1.fr', 'intel_xeon_gold_6226r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix94.ipr.univ-rennes1.fr', 'intel_xeon_gold_6226r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix95.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix96.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix97.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix98.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix99.ipr.univ-rennes1.fr', 'intel_xeon_gold_6240r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix100.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix101.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
self.add_cluster_node_def(ClusterNodeDef('physix102.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
self.cpu_defs = {}
self.add_cpu_def(CpuDef('intel_xeon_x5550', 4))
@ -152,10 +159,12 @@ class ClusterNodeDb:
def get_host_group_info(self, host_group_id: HostGroupId) -> Tuple[List[HostFqdn], int]:
hosts = [cluster_node_def.host_fqdn for cluster_node_def in self.cluster_nodes_defs if cluster_node_def.cpu_id == host_group_id]
num_cpus_set = set([cluster_node_def.num_cpus for cluster_node_def in self.cluster_nodes_defs if cluster_node_def.cpu_id == host_group_id])
assert len(num_cpus_set) > 0
assert len(num_cpus_set) <= 1, f'the number of cpus for the host group {host_group_id} is not homogen ({num_cpus_set})'
num_cpus = num_cpus_set.pop()
num_cores = self.cpu_defs[host_group_id].num_cores * num_cpus
if len(num_cpus_set) > 0:
num_cpus = num_cpus_set.pop()
num_cores = self.cpu_defs[host_group_id].num_cores * num_cpus
else:
num_cores = 0
return (hosts, num_cores)
@ -164,10 +173,19 @@ def launch_job_for_host_group(hibridon_version: GitCommitTag, host_group_id: Hos
cluster_db = ClusterNodeDb()
(hosts, num_cores) = cluster_db.get_host_group_info(host_group_id)
if len(hosts) == 0:
logging.warning('skipping benchmarks with compiler %s on architecture %s because no hosts are available for it', compiler_id, host_group_id)
return
# quick_test = 'arch4_quick' # about 2s on a core i5 8th generation
representative_test = 'nh3h2_qma_long' # about 10min on a core i5 8th generation
benchmark_test = representative_test
quick_test = 'arch4_quick' # about 2s on a core i5 8th generation
representative_test = 'nh3h2_qma_long' # about 10min on a core i5 8th generation
use_test_mode = False
if use_test_mode:
benchmark_test = quick_test
else:
benchmark_test = representative_test
logging.info('using test %s for benchmarking', benchmark_test)
if benchmark_test == 'arch4_quick':
ram_per_core = '1G'
elif benchmark_test == 'nh3h2_qma_long':
@ -247,20 +265,9 @@ def launch_perf_jobs(hibridon_version: GitCommitTag, results_dir: Path, arch_reg
'ifort'
]
all_host_groups = [
'intel_xeon_x5550',
'intel_xeon_x5650',
'intel_xeon_e5-2660',
'intel_xeon_e5-2660v2',
'intel_xeon_e5-2660v4',
'intel_xeon_gold_6140',
'intel_xeon_gold_6154',
'intel_xeon_gold_5222',
'intel_xeon_gold_6226r',
'intel_xeon_gold_6240r',
'intel_xeon_gold_6248r',
'amd_epyc_7282',
]
cluster_db = ClusterNodeDb()
all_host_groups = cluster_db.cpu_defs.keys()
print(f'available host groups: {all_host_groups}')
host_groups = [host_group for host_group in all_host_groups if re.match(arch_regexp, host_group) is not None]
print(f'requested host groups: {host_groups}')