Improved the hosts table to make it easier to adapt to alambix instead of physix.

This host description table is still hardcoded, though. At some point it would be better to load it from external data to make the code more generic (an attempt was made with pandas, but it made the setup more complex, so I decided to keep the table hardcoded for the moment).

work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372]
This commit is contained in:
Guillaume Raffy 2024-10-08 17:22:26 +02:00
parent 350785bfee
commit c534d7e135
1 changed files with 120 additions and 99 deletions

View File

@ -36,113 +36,134 @@ def substitute_tags(input_file_path: Path, tags_dict: Dict[str, str], output_fil
out_file.write(line)
# NOTE(review): this span appears to come from a rendered diff whose +/- markers and
# indentation were lost: the lines of the free function get_host_group_info are
# interleaved with the lines of the ClusterNodeDef and CpuDef classes.
#
# get_host_group_info maps a host group id to a (hosts, num_cores) pair through an
# if/elif chain of hardcoded host lists.
# NOTE(review): num_cores is assigned string values ('8', '12', ...) although the return
# annotation declares an int.
def get_host_group_info(host_group_id: HostGroupId) -> Tuple[List[HostFqdn], int]:
if host_group_id == 'intel_xeon_x5550':
hosts = ['physix48.ipr.univ-rennes1.fr']
num_cores = '8'
elif host_group_id == 'intel_xeon_x5650':
hosts = [
'physix49.ipr.univ-rennes1.fr',
'physix50.ipr.univ-rennes1.fr',
'physix51.ipr.univ-rennes1.fr',
'physix52.ipr.univ-rennes1.fr',
'physix53.ipr.univ-rennes1.fr',
'physix54.ipr.univ-rennes1.fr',
'physix55.ipr.univ-rennes1.fr',
'physix56.ipr.univ-rennes1.fr',
'physix57.ipr.univ-rennes1.fr',
'physix58.ipr.univ-rennes1.fr',
'physix59.ipr.univ-rennes1.fr',]
num_cores = '12'
elif host_group_id == 'intel_xeon_e5-2660':
hosts = [
'physix60.ipr.univ-rennes1.fr',
'physix61.ipr.univ-rennes1.fr',
'physix62.ipr.univ-rennes1.fr',
'physix63.ipr.univ-rennes1.fr',
# ClusterNodeDef (class lines interleaved here): one cluster node, described by its
# fully qualified host name, the id of its cpu model and its number of cpus.
class ClusterNodeDef:
host_fqdn: str
cpu_id: str
num_cpus: int
'physix64.ipr.univ-rennes1.fr',
'physix65.ipr.univ-rennes1.fr',
'physix66.ipr.univ-rennes1.fr',
'physix67.ipr.univ-rennes1.fr',
# ClusterNodeDef constructor: stores the three arguments as attributes, unchanged.
def __init__(self, host_fqdn: str, cpu_id: str, num_cpus: int):
self.host_fqdn = host_fqdn
self.cpu_id = cpu_id
self.num_cpus = num_cpus
'physix68.ipr.univ-rennes1.fr',
'physix69.ipr.univ-rennes1.fr',
'physix70.ipr.univ-rennes1.fr',
'physix71.ipr.univ-rennes1.fr']
num_cores = '16'
elif host_group_id == 'intel_xeon_e5-2660v2':
hosts = [
'physix72.ipr.univ-rennes1.fr',
'physix73.ipr.univ-rennes1.fr',
'physix74.ipr.univ-rennes1.fr',
'physix75.ipr.univ-rennes1.fr',
'physix76.ipr.univ-rennes1.fr',
'physix77.ipr.univ-rennes1.fr',
'physix78.ipr.univ-rennes1.fr',
'physix79.ipr.univ-rennes1.fr',
# CpuDef (class lines interleaved here): one cpu model, described by its id and its
# number of cores.
class CpuDef:
cpu_id: str
num_cores: int
'physix80.ipr.univ-rennes1.fr',
'physix81.ipr.univ-rennes1.fr',
'physix82.ipr.univ-rennes1.fr',
# NOTE(review): physix84 is also listed in the 'intel_xeon_e5-2660v4' group just below,
# while physix83 is in no group of this function -- possibly a typo for physix83.
'physix84.ipr.univ-rennes1.fr']
num_cores = '20'
elif host_group_id == 'intel_xeon_e5-2660v4':
hosts = [
'physix84.ipr.univ-rennes1.fr',
'physix85.ipr.univ-rennes1.fr',
'physix86.ipr.univ-rennes1.fr',
'physix87.ipr.univ-rennes1.fr']
num_cores = '28'
elif host_group_id == 'intel_xeon_gold_6140':
hosts = [
'physix88.ipr.univ-rennes1.fr',
'physix89.ipr.univ-rennes1.fr']
num_cores = '36'
elif host_group_id == 'intel_xeon_gold_6154':
hosts = [
'physix90.ipr.univ-rennes1.fr']
num_cores = '72'
elif host_group_id == 'intel_xeon_gold_5222':
hosts = [
'physix92.ipr.univ-rennes1.fr']
num_cores = '4'
elif host_group_id == 'intel_xeon_gold_6226r':
hosts = [
'physix93.ipr.univ-rennes1.fr',
'physix94.ipr.univ-rennes1.fr']
num_cores = '32'
elif host_group_id == 'intel_xeon_gold_6240r':
hosts = [
'physix99.ipr.univ-rennes1.fr']
num_cores = '48'
elif host_group_id == 'intel_xeon_gold_6248r':
hosts = [
'physix95.ipr.univ-rennes1.fr',
'physix96.ipr.univ-rennes1.fr',
'physix97.ipr.univ-rennes1.fr',
'physix98.ipr.univ-rennes1.fr',
# NOTE(review): physix99 is also the only host of the 'intel_xeon_gold_6240r' group above.
'physix99.ipr.univ-rennes1.fr',
'physix100.ipr.univ-rennes1.fr',
'physix101.ipr.univ-rennes1.fr',
'physix102.ipr.univ-rennes1.fr']
num_cores = '48'
elif host_group_id == 'amd_epyc_7282':
hosts = [
'physix12.ipr.univ-rennes1.fr',
'physix13.ipr.univ-rennes1.fr',
'physix14.ipr.univ-rennes1.fr',
'physix15.ipr.univ-rennes1.fr']
num_cores = '32'
else:
# NOTE(review): asserting a non-empty string always passes, so an unknown host_group_id
# is not rejected here; the return below then reads hosts and num_cores, which no
# branch has assigned in that case.
assert f"unhandled host_group_id : {host_group_id}"
return (hosts, num_cores)
# CpuDef constructor: stores the two arguments as attributes, unchanged.
def __init__(self, cpu_id: str, num_cores: int):
self.cpu_id = cpu_id
self.num_cores = num_cores
class ClusterNodeDb:
    """Hardcoded database of cluster nodes and of the cpu models they are built with.

    Nodes are grouped by the id of their cpu model: a "host group" is the set of
    registered nodes that share the same ``cpu_id``.
    """
    cluster_nodes_defs: List[ClusterNodeDef]  # every registered node, in registration order
    cpu_defs: Dict[str, CpuDef]  # cpu id -> definition of that cpu model

    def __init__(self):
        """Populate the database with the hardcoded node and cpu definitions.

        The commented-out entries are deliberately kept in the table (presumably as a
        reference for nodes that are currently not used); uncomment a line to enable
        the node again.
        """
        self.cluster_nodes_defs = []
        self.add_cluster_node_def(ClusterNodeDef('alambix50.ipr.univ-rennes.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('alambix75.ipr.univ-rennes.fr', 'intel_xeon_e5-2660v2', 2))
        self.add_cluster_node_def(ClusterNodeDef('alambix103.ipr.univ-rennes.fr', 'amd_epyc_7452', 2))
        self.add_cluster_node_def(ClusterNodeDef('alambix104.ipr.univ-rennes.fr', 'intel_xeon_gold_6248r', 2))
        self.add_cluster_node_def(ClusterNodeDef('alambix105.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
        self.add_cluster_node_def(ClusterNodeDef('alambix106.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
        self.add_cluster_node_def(ClusterNodeDef('alambix107.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
        self.add_cluster_node_def(ClusterNodeDef('alambix108.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix12.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix13.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix14.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix15.ipr.univ-rennes1.fr', 'amd_epyc_7282', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix48.ipr.univ-rennes1.fr', 'intel_xeon_x5550', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix49.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix51.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix52.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix53.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix54.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix55.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix56.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix57.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix58.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix59.ipr.univ-rennes1.fr', 'intel_xeon_x5650', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix60.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix61.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix62.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix63.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix64.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix65.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix66.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix67.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix68.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix69.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix70.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix71.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix72.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix73.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix74.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix76.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix77.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix78.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix79.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix80.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix81.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix82.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix83.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v2', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix84.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix85.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix86.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix87.ipr.univ-rennes1.fr', 'intel_xeon_e5-2660v4', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix88.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix89.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix90.ipr.univ-rennes1.fr', 'intel_xeon_gold_6154', 4))
        # self.add_cluster_node_def(ClusterNodeDef('physix91.ipr.univ-rennes1.fr', 'intel_xeon_gold_6140', 4))
        # self.add_cluster_node_def(ClusterNodeDef('physix92.ipr.univ-rennes1.fr', 'intel_xeon_gold_5220', 1))
        # self.add_cluster_node_def(ClusterNodeDef('physix93.ipr.univ-rennes1.fr', 'intel_xeon_gold_6226r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix94.ipr.univ-rennes1.fr', 'intel_xeon_gold_6226r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix95.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix96.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix97.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix98.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
        # NOTE(review): no CpuDef is registered below for 'intel_xeon_gold_6240r'; add one
        # before re-enabling physix99, otherwise get_host_group_info raises KeyError for it.
        # self.add_cluster_node_def(ClusterNodeDef('physix99.ipr.univ-rennes1.fr', 'intel_xeon_gold_6240r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix100.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix101.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
        # self.add_cluster_node_def(ClusterNodeDef('physix102.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))

        # second argument of CpuDef: number of cores of one cpu of that model
        self.cpu_defs = {}
        self.add_cpu_def(CpuDef('intel_xeon_x5550', 4))
        self.add_cpu_def(CpuDef('intel_xeon_x5650', 6))
        self.add_cpu_def(CpuDef('intel_xeon_e5-2660', 8))
        self.add_cpu_def(CpuDef('intel_xeon_e5-2660v2', 10))
        self.add_cpu_def(CpuDef('intel_xeon_e5-2660v4', 14))
        self.add_cpu_def(CpuDef('intel_xeon_gold_6140', 18))
        self.add_cpu_def(CpuDef('intel_xeon_gold_6154', 18))
        self.add_cpu_def(CpuDef('intel_xeon_gold_5220', 4))
        self.add_cpu_def(CpuDef('intel_xeon_gold_6226r', 16))
        self.add_cpu_def(CpuDef('intel_xeon_gold_6248r', 24))
        self.add_cpu_def(CpuDef('intel_xeon_gold_6348', 28))
        self.add_cpu_def(CpuDef('amd_epyc_7282', 16))
        self.add_cpu_def(CpuDef('amd_epyc_7452', 32))

    def add_cluster_node_def(self, cluster_node_def: ClusterNodeDef):
        """Register a cluster node (appended after the already registered ones)."""
        self.cluster_nodes_defs.append(cluster_node_def)

    def add_cpu_def(self, cpu_def: CpuDef):
        """Register a cpu model under its cpu_id, replacing any definition with the same id."""
        self.cpu_defs[cpu_def.cpu_id] = cpu_def

    def get_host_group_info(self, host_group_id: HostGroupId) -> Tuple[List[HostFqdn], int]:
        """Return the hosts of a host group and the number of cores of each of them.

        Args:
            host_group_id: id of the host group; it is compared with the cpu_id of the
                registered nodes and used as key into the registered cpu definitions.

        Returns:
            A tuple (hosts, num_cores): hosts lists the fqdn of every registered node whose
            cpu_id equals host_group_id, in registration order; num_cores is the number of
            cores of the cpu model multiplied by the number of cpus per node.

        Raises:
            ValueError: if no registered node belongs to the host group, or if the nodes
                of the group don't all have the same number of cpus.
            KeyError: if no cpu definition is registered for host_group_id.
        """
        group_nodes = [node for node in self.cluster_nodes_defs if node.cpu_id == host_group_id]
        # explicit exceptions instead of assert statements: asserts are removed when python
        # runs with -O, which would let invalid groups go through unnoticed
        if not group_nodes:
            raise ValueError(f'unhandled host_group_id : {host_group_id} (no cluster node uses this cpu)')
        num_cpus_set = {node.num_cpus for node in group_nodes}
        if len(num_cpus_set) > 1:
            raise ValueError(f'the number of cpus for the host group {host_group_id} is not homogeneous ({num_cpus_set})')
        num_cpus = num_cpus_set.pop()
        hosts = [node.host_fqdn for node in group_nodes]
        num_cores = self.cpu_defs[host_group_id].num_cores * num_cpus
        return (hosts, num_cores)
def launch_job_for_host_group(hibridon_version: GitCommitTag, host_group_id: HostGroupId, results_dir: Path, compiler_id: CompilerId):
(hosts, num_cores) = get_host_group_info(host_group_id)
cluster_db = ClusterNodeDb()
(hosts, num_cores) = cluster_db.get_host_group_info(host_group_id)
# quick_test = 'arch4_quick' # about 2s on a core i5 8th generation
representative_test = 'nh3h2_qma_long' # about 10min on a core i5 8th generation