402 lines
23 KiB
Python
Executable File
402 lines
23 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# this script launches jobs to run hibridon benchmarks on physix cluster for the given version of hibridon (commit number)
|
|
from typing import List, Tuple, Dict
|
|
import argparse
|
|
from os import getenv, makedirs
|
|
import shutil
|
|
from pathlib import Path
|
|
import subprocess
|
|
import re
|
|
import logging
|
|
import importlib.resources
|
|
import venv
|
|
import json
|
|
import abc
|
|
from .core import IBenchmark, BenchmarkConfig, BenchmarkId, ResultsDbParams, BenchParam, HostTypeId
|
|
from .main import BenchmarkFactory
|
|
from .util import Singleton
|
|
|
|
|
|
# type aliases used throughout this module; they are plain str at runtime but make
# signatures self-describing
HostFqdn = str  # eg 'physix90.ipr.univ-rennes1.fr'
GitCommitTag = str  # commit number eg 'a3bed1c3ccfbca572003020d3e3d3b1ff3934fad'
HostGroupId = str  # eg 'xeon_gold_6140'
CompilerId = str  # eg 'gfortran'
ClusterId = str  # eg 'alambix'
|
|
|
|
|
|
def substitute_tag_with_filecontents(input_file_path: Path, tag: str, contents_file: Path, output_file_path: Path):
    """Copies input_file_path to output_file_path, replacing every occurrence of tag with the contents of contents_file.

    Args:
        input_file_path: the template file to read
        tag: the placeholder string to substitute (eg '<include:starbench.py>')
        contents_file: the file whose whole text replaces each occurrence of tag
        output_file_path: where the substituted text is written
    """
    # read the replacement text once; Path.read_text closes the file handle
    # (the original open(...).read() leaked it)
    contents = Path(contents_file).read_text(encoding='utf8')
    with open(input_file_path, 'rt', encoding='utf8') as template_file, open(output_file_path, 'wt', encoding='utf8') as out_file:
        # iterate the file object directly instead of materializing readlines()
        for template_line in template_file:
            out_file.write(template_line.replace(tag, contents))
|
|
|
|
|
|
def substitute_tags(input_file_path: Path, tags_dict: Dict[str, str], output_file_path: Path):
    """Copies input_file_path to output_file_path, replacing every occurrence of each tag with its value.

    Args:
        input_file_path: the template file to read
        tags_dict: maps each tag to its replacement; for tags starting with '<include:'
            the value is interpreted as the path of a file whose contents are substituted
        output_file_path: where the substituted text is written
    """
    # resolve every tag's replacement text once, up front. The original re-opened
    # (and leaked) each '<include:...>' file for every single template line.
    resolved_tags: Dict[str, str] = {}
    for tag, value in tags_dict.items():
        if re.match(r'<include:', tag) is not None:
            resolved_tags[tag] = Path(value).read_text(encoding='utf8')
        else:
            resolved_tags[tag] = value

    with open(input_file_path, 'rt', encoding='utf8') as template_file, open(output_file_path, 'wt', encoding='utf8') as out_file:
        for template_line in template_file:
            line = template_line
            for tag, contents in resolved_tags.items():
                line = line.replace(tag, contents)
            out_file.write(line)
|
|
|
|
|
|
class ClusterNodeDef:
    """definition of one compute node of a cluster"""
    host_fqdn: str  # eg 'physix90.ipr.univ-rennes1.fr'
    cpu_id: str  # the cpu model of this node, eg 'intel_xeon_gold_6140'
    num_cpus: int  # number of cpus (sockets) populated on this node

    def __init__(self, host_fqdn: str, cpu_id: str, num_cpus: int):
        self.host_fqdn = host_fqdn
        self.cpu_id = cpu_id
        self.num_cpus = num_cpus

    def __repr__(self) -> str:
        # added for debuggability: the default object repr hides the fields
        return f'{type(self).__name__}(host_fqdn={self.host_fqdn!r}, cpu_id={self.cpu_id!r}, num_cpus={self.num_cpus!r})'
|
|
|
|
|
|
class CpuDef:
    """definition of a cpu model"""
    cpu_id: str  # eg 'intel_xeon_gold_6140'
    num_cores: int  # number of cores of this cpu model

    def __init__(self, cpu_id: str, num_cores: int):
        self.cpu_id = cpu_id
        self.num_cores = num_cores

    def __repr__(self) -> str:
        # added for debuggability: the default object repr hides the fields
        return f'{type(self).__name__}(cpu_id={self.cpu_id!r}, num_cores={self.num_cores!r})'
|
|
|
|
|
|
class ClusterNodeDb:
    """hardcoded inventory of the compute nodes (and their cpu models) of a cluster

    For each supported cluster id it records which nodes exist, which cpu model
    they use and how many cpu sockets they have, plus the number of cores of each
    cpu model.
    """
    cluster_nodes_defs: List[ClusterNodeDef]  # one entry per compute node of the cluster
    cpu_defs: Dict[str, CpuDef]  # cpu model id -> cpu definition (the original annotation Dict[str, int] was wrong: add_cpu_def stores CpuDef instances)

    def __init__(self, cluster_id: ClusterId = 'alambix'):
        """populates the inventory for the given cluster

        Raises:
            ValueError: if cluster_id is not a known cluster id
        """
        self.cluster_nodes_defs = []
        include_multiqueue_nodes = False  # at the moment hibench only works on nodes that have all their cores in the same queue
        if cluster_id == 'alambix':
            self.add_cluster_node_def(ClusterNodeDef('alambix50.ipr.univ-rennes.fr', 'intel_xeon_x5650', 2))
            if include_multiqueue_nodes:
                self.add_cluster_node_def(ClusterNodeDef('physix90.ipr.univ-rennes1.fr', 'intel_xeon_gold_6154', 4))  # also has some cores reserved for gpuonly.q
            self.add_cluster_node_def(ClusterNodeDef('alambix103.ipr.univ-rennes.fr', 'amd_epyc_7452', 2))
            if include_multiqueue_nodes:
                self.add_cluster_node_def(ClusterNodeDef('alambix104.ipr.univ-rennes.fr', 'intel_xeon_gold_6248r', 2))  # also has some cores reserved for gpuonly.q
            for host_index in [105, 106, 107, 108]:
                self.add_cluster_node_def(ClusterNodeDef(f'alambix{host_index}.ipr.univ-rennes.fr', 'intel_xeon_gold_6348', 2))
        elif cluster_id == 'physix':
            # data-driven form of the original repetitive add_cluster_node_def calls;
            # the declaration order is preserved because it drives the order of the
            # hosts returned by get_host_group_info
            physix_node_groups: List[Tuple[List[int], str, int]] = [
                ([12, 13, 14, 15], 'amd_epyc_7282', 2),
                ([48], 'intel_xeon_x5550', 2),
                ([49, 51, 52, 53, 54, 55, 56, 57, 58, 59], 'intel_xeon_x5650', 2),
                ([60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], 'intel_xeon_e5-2660', 2),
                ([72, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83], 'intel_xeon_e5-2660v2', 2),
                ([84, 85, 86, 87], 'intel_xeon_e5-2660v4', 2),
                ([88, 89], 'intel_xeon_gold_6140', 2),
                ([91], 'intel_xeon_gold_6140', 4),
                ([92], 'intel_xeon_gold_5220', 1),
                ([93, 94], 'intel_xeon_gold_6226r', 2),
                ([95, 96, 97, 98], 'intel_xeon_gold_6248r', 2),
                ([99], 'intel_xeon_gold_6240r', 2),
                ([100, 101, 102], 'intel_xeon_gold_6248r', 2),
            ]
            for host_indices, cpu_id, num_cpus in physix_node_groups:
                for host_index in host_indices:
                    self.add_cluster_node_def(ClusterNodeDef(f'physix{host_index}.ipr.univ-rennes1.fr', cpu_id, num_cpus))
        elif cluster_id == 'dummy':
            self.add_cluster_node_def(ClusterNodeDef('graffy-ws2.ipr.univ-rennes.fr', 'intel_core_i5_8350u', 1))
        else:
            # the original used `assert False`, which is silently stripped under python -O
            raise ValueError(f'unknown cluster id: {cluster_id}')

        self.cpu_defs = {}
        for cpu_id, num_cores in [
            ('intel_core_i5_8350u', 4),
            ('intel_xeon_x5550', 4),
            ('intel_xeon_x5650', 6),
            ('intel_xeon_e5-2660', 8),
            ('intel_xeon_e5-2660v2', 10),
            ('intel_xeon_e5-2660v4', 14),
            ('intel_xeon_gold_6140', 18),
            ('intel_xeon_gold_6154', 18),
            ('intel_xeon_gold_5220', 4),
            ('intel_xeon_gold_6226r', 16),
            ('intel_xeon_gold_6248r', 24),
            ('intel_xeon_gold_6348', 28),
            ('amd_epyc_7282', 16),
            ('amd_epyc_7452', 32),
        ]:
            self.add_cpu_def(CpuDef(cpu_id, num_cores))

    def add_cluster_node_def(self, cluster_node_def: ClusterNodeDef):
        """registers a compute node in the inventory"""
        self.cluster_nodes_defs.append(cluster_node_def)

    def add_cpu_def(self, cpu_def: CpuDef):
        """registers a cpu model in the inventory"""
        self.cpu_defs[cpu_def.cpu_id] = cpu_def

    def get_host_group_info(self, host_group_id: HostGroupId) -> Tuple[List[HostFqdn], int]:
        """returns the host fqdns of the nodes in the given host group, along with the number of cores each of these nodes provides

        A host group is the set of nodes that use the given cpu model; if no node
        uses this cpu model, ([], 0) is returned.
        """
        hosts = [cluster_node_def.host_fqdn for cluster_node_def in self.cluster_nodes_defs if cluster_node_def.cpu_id == host_group_id]
        num_cpus_set = set(cluster_node_def.num_cpus for cluster_node_def in self.cluster_nodes_defs if cluster_node_def.cpu_id == host_group_id)
        assert len(num_cpus_set) <= 1, f'the number of cpus for the host group {host_group_id} is not homogeneous ({num_cpus_set})'
        if len(num_cpus_set) > 0:
            num_cpus = num_cpus_set.pop()
            # cores per node = cores per cpu * cpus per node
            num_cores = self.cpu_defs[host_group_id].num_cores * num_cpus
        else:
            num_cores = 0
        return (hosts, num_cores)
|
|
|
|
|
|
class ICluster(abc.ABC):
    """abstract base class for a compute cluster that benchmark jobs can be submitted to"""
    cluster_db: ClusterNodeDb  # the description of this cluster's nodes and cpus
    cluster_id: ClusterId  # eg 'alambix'

    def __init__(self, cluster_id: ClusterId, cluster_db: ClusterNodeDb):
        self.cluster_id = cluster_id
        self.cluster_db = cluster_db

    @abc.abstractmethod
    def path_is_reachable_by_compute_nodes(self, path: Path) -> bool:
        """returns True if the given path is accessible from this cluster's compute nodes (eg because it's on a shared filesystem)"""
        pass

    @abc.abstractmethod
    def submit_job(self, qsub_args: List[str], exec_path: Path, exec_args: List[str], working_dir: Path):
        """submits the given executable as a job on this cluster

        qsub_args: the arguments sent to qsub, eg ['-pe', 'smp', '12', 'gaussian.job', 'h2o.gjf']
        exec_path: the path of the executable (job script) to run
        exec_args: the arguments passed to the executable
        working_dir: the directory from which the job is submitted
        """

    def get_cluster_db(self) -> ClusterNodeDb:
        """returns the description of this cluster's nodes and cpus"""
        return self.cluster_db
|
|
|
|
|
|
class IprCluster(ICluster):
    """a real cluster of IPR (Institut de Physique de Rennes), where jobs are submitted through sge's qsub command"""

    def __init__(self, cluster_id: ClusterId):
        super().__init__(cluster_id, ClusterNodeDb(cluster_id))

    def path_is_reachable_by_compute_nodes(self, path: Path):
        # a path is considered reachable if it lives under one of the known
        # cluster-wide shared disk roots
        shared_disk_roots = [Path('/opt/ipr/cluster/work.global')]
        for shared_disk_root in shared_disk_roots:
            try:
                _ = path.relative_to(shared_disk_root)
            except ValueError:
                # path is not under this root; try the next one
                continue
            return True
        return False

    def submit_job(self, qsub_args: List[str], exec_path: Path, exec_args: List[str], working_dir: Path):
        joined_qsub_args = " ".join(qsub_args)
        joined_exec_args = " ".join(exec_args)
        qsub_command = f'qsub {joined_qsub_args} {exec_path} {joined_exec_args}'
        logging.debug('qsub_command = %s, working_dir=%s', qsub_command, working_dir)
        # shell=True is required here: some qsub arguments carry shell quoting
        subprocess.run(qsub_command, cwd=working_dir, check=True, shell=True)
|
|
|
|
|
|
class DummyCluster(ICluster):
    """a fake cluster used for testing: jobs are executed immediately on the local machine instead of being queued"""

    def __init__(self):
        super().__init__('dummy', ClusterNodeDb('dummy'))

    def path_is_reachable_by_compute_nodes(self, path: Path):
        # everything runs locally, so any path is reachable
        return True

    def submit_job(self, qsub_args: List[str], exec_path: Path, exec_args: List[str], working_dir: Path):
        # build the command that would have been submitted, for traceability only
        qsub_command = f'qsub {" ".join(qsub_args)} {exec_path} {" ".join(exec_args)}'
        logging.info('executing %s as a replacement of qsub_command %s, working_dir=%s', exec_path, qsub_command, working_dir)
        # run the job script synchronously instead of queueing it
        subprocess.run(exec_path, check=True, cwd=working_dir)
|
|
|
|
|
|
class ClusterFactory(metaclass=Singleton):
    """factory that creates the ICluster instance matching a cluster id"""

    def __init__(self):
        pass

    def create_cluster(self, cluster_id: ClusterId) -> ICluster:
        """creates the cluster matching cluster_id

        Raises:
            KeyError: if cluster_id is unknown
        """
        # map ids to constructors instead of instances: the original eagerly built
        # all three clusters (and their node databases) on every call just to
        # return one of them
        cluster_creators = {
            'dummy': DummyCluster,
            'physix': lambda: IprCluster('physix'),
            'alambix': lambda: IprCluster('alambix'),
        }
        return cluster_creators[cluster_id]()
|
|
|
|
|
|
def duplicate_this_virtualenv_to(duplicate_virtualenv_path: Path):
    """clones the currently active python virtual environment to duplicate_virtualenv_path

    The clone is performed with the virtualenv-clone tool, which is installed into a
    throwaway bootstrap virtual environment so that the current environment stays untouched.

    Raises:
        RuntimeError: if no virtual environment is currently active or its root doesn't exist
    """
    virtual_env = getenv('VIRTUAL_ENV')  # eg /home/graffy/work/starbench/iprbench.git/iprbench.venv
    if virtual_env is None:
        # the original crashed with an unhelpful TypeError (Path(None)) before its assert
        # could fire; also, assert statements are stripped under python -O, so validate
        # with a real exception
        raise RuntimeError('failed to find the root of the virtual environment in use (the VIRTUAL_ENV environment variable is not set)')
    this_virtualenv_path = Path(virtual_env)
    if not this_virtualenv_path.exists():
        raise RuntimeError(f'failed to find the root the virtual environment in use (VIRTUAL_ENV environment variable has the value {this_virtualenv_path})')

    if duplicate_virtualenv_path.exists():
        shutil.rmtree(duplicate_virtualenv_path)

    # bootstrap a temporary venv that only hosts the virtualenv-clone tool
    cloner_virtualenv_path = Path('/tmp/venv_cloner.venv')
    venv.create(cloner_virtualenv_path, with_pip=True)
    subprocess.run(f'source {cloner_virtualenv_path}/bin/activate; pip install virtualenv-clone', shell=True, check=True, executable='/bin/bash')
    subprocess.run(f'source {cloner_virtualenv_path}/bin/activate; virtualenv-clone {this_virtualenv_path} {duplicate_virtualenv_path}', shell=True, check=True, executable='/bin/bash')
    shutil.rmtree(cloner_virtualenv_path)
|
|
|
|
|
|
def archive_this_virtualenv_to(venv_archive_path: Path, venv_hardcoded_path: Path):
    """clones the current virtual environment to venv_hardcoded_path, then archives that clone into the tarball venv_archive_path"""
    duplicate_this_virtualenv_to(venv_hardcoded_path)
    # archive the clone's directory name relative to its parent, so that extracting
    # the tarball recreates it at the hardcoded path
    archive_member = venv_hardcoded_path.relative_to(venv_hardcoded_path.parent)
    tar_command = f'tar czvf {venv_archive_path} {archive_member}'
    subprocess.run(tar_command, shell=True, check=True, cwd=venv_hardcoded_path.parent, stdout=subprocess.DEVNULL)
|
|
|
|
|
|
def launch_job_for_host_group(benchmark: IBenchmark, benchmark_config: BenchmarkConfig, host_group_id: HostGroupId, results_dir: Path, cluster: ICluster, resultsdb_params: ResultsDbParams, target_system_type_id: HostTypeId):
    """creates a job file for the given benchmark and host group, then submits it to the cluster

    benchmark: the benchmark to run
    benchmark_config: the benchmark's parameters; must contain 'fortran_compiler' and is expected to accept a 'num_cores' parameter
    host_group_id: the cpu model identifying the set of nodes the job is allowed to run on (eg 'intel_xeon_gold_6140')
    results_dir: where the results of the benchmark are stored
    cluster: the cluster the job is submitted to
    resultsdb_params: the configuration of the results database, forwarded to the job
    target_system_type_id: id of the operating system type of the target nodes
    """

    compiler_id: CompilerId = benchmark_config['fortran_compiler']

    cluster_db = cluster.get_cluster_db()

    (hosts, num_cores) = cluster_db.get_host_group_info(host_group_id)
    if len(hosts) == 0:
        logging.warning('skipping benchmarks with compiler %s on architecture %s because no hosts are available for it', compiler_id, host_group_id)
        return

    benchmark_config['num_cores'] = num_cores  # we expect the benchmark to have the parameter num_cores

    makedirs(results_dir, exist_ok=True)

    # each host group gets its own subdirectory of results_dir
    this_bench_dir = Path(f'{results_dir}/{host_group_id}')
    makedirs(this_bench_dir, exist_ok=True)

    starbench_job_path = this_bench_dir / 'starbench.job'

    # the jobs run from a frozen copy of the current virtual environment, shared by all
    # jobs of this bench
    job_venv_archive_path = results_dir / 'iprbench.venv.tgz'
    iprbench_venv_hardcoded_path = Path('/tmp') / 'iprbench.venv'
    if job_venv_archive_path.exists():
        logging.info('skipping the creation of %s because it already exists (probably created for other jobs of the same bench)', job_venv_archive_path)
    else:
        # freeze this virtualenv so that all jobs related to this benchmark will use the same version of iprbench
        logging.info('creating %s (the virtual environment that will be used in this bench by all its jobs at some point)', job_venv_archive_path)
        archive_this_virtualenv_to(job_venv_archive_path, iprbench_venv_hardcoded_path)

    logging.debug("type of resultsdb_params = %s", type(resultsdb_params))
    logging.debug("resultsdb_params = %s", resultsdb_params)
    logging.debug("resultsdb_params = %s", json.dumps(resultsdb_params))

    # create the job file (which embeds starbench.py)
    # the json strings have their double quotes backslash-escaped, presumably because
    # they end up inside a double-quoted string in the generated job script — confirm
    # against clusterbench-template.job
    tags_dict = {
        # '<include:starbench.py>': scripts_dir / 'starbench.py',
        '<benchmark_id>': str(benchmark.bench_id),
        '<starbench_job_path>': str(starbench_job_path),
        '<iprbench_venv_hardcoded_path>': str(iprbench_venv_hardcoded_path),
        '<iprbench_venv_archive_path>': str(job_venv_archive_path),
        '<benchmark_config>': json.dumps(benchmark_config).replace('"', r'\"'),
        '<results_dir>': str(results_dir),
        '<resultsdb_params>': json.dumps(resultsdb_params).replace('"', r'\"'),
        '<num_cores>': str(num_cores),
        '<target_system_type_id>': str(target_system_type_id),
    }
    logging.debug('tags_dict = %s', str(tags_dict))
    with importlib.resources.path('iprbench.resources', 'clusterbench-template.job') as job_template_path:
        # job_template_path = importlib.resources..files('iprbench.resources') / 'hibench' / 'starbench-template.job'
        substitute_tags(input_file_path=job_template_path, tags_dict=tags_dict, output_file_path=starbench_job_path)
    # the job file is executed directly by the scheduler, so it must be executable
    subprocess.run(['chmod', 'a+x', starbench_job_path], check=True)

    ram_requirements = benchmark.get_ram_requirements(benchmark_config)
    # memory requirement expressed per core (in GB), since the job reserves num_cores cores
    ram_per_core = f'{ram_requirements / num_cores / 1.e9}G'

    qsub_args = []
    qsub_args += ['-pe', 'smp', f'{num_cores}']  # reserve num_cores slots in the smp parallel environment
    qsub_args += ['-l', f'"hostname={"|".join(hosts)}"']  # restrict the job to the nodes of this host group
    qsub_args += ['-S', '/bin/bash']
    qsub_args += ['-cwd']  # run the job from the directory it was submitted from
    qsub_args += ['-m', 'ae']  # email notification on job abort and end
    qsub_args += ['-l', f'mem_available={ram_per_core}']
    qsub_args += ['-j', 'y']  # merge stderr file into stdout file for easier reading of history of events
    qsub_args += ['-N', f'hibench_{host_group_id}']  # job name

    logging.debug('qsub_args = %s', str(qsub_args))

    exec_path = starbench_job_path
    exec_args = []

    cluster.submit_job(qsub_args, exec_path, exec_args, this_bench_dir)
|
|
|
|
|
|
def launch_perf_jobs(benchmark: IBenchmark, benchmark_config: BenchmarkConfig, results_dir: Path, cluster: ICluster, arch_regexp: str, resultsdb_params: ResultsDbParams, target_system_type_id: HostTypeId):
    """submits one benchmark job per host group whose id matches arch_regexp

    results_dir: where the results of the benchmark are stored (eg $GLOBAL_WORK_DIR/graffy/benchmarks/hibench)
    arch_regexp: regular expression selecting the host groups to benchmark (eg 'intel_xeon_.*')
    """

    # use the node database of the cluster we were given; the original built a fresh
    # ClusterNodeDb(), which silently described the default cluster ('alambix')
    # regardless of the cluster actually passed in
    cluster_db = cluster.get_cluster_db()
    all_host_groups = cluster_db.cpu_defs.keys()

    logging.info('available host groups: %s', all_host_groups)
    host_groups = [host_group for host_group in all_host_groups if re.match(arch_regexp, host_group) is not None]
    logging.info('requested host groups: %s', host_groups)

    for host_group in host_groups:
        launch_job_for_host_group(benchmark, benchmark_config, host_group, results_dir, cluster, resultsdb_params, target_system_type_id)
|
|
|
|
|
|
def main():
    """command line entry point: parses the arguments and submits the requested benchmark jobs to the cluster"""
    logging.basicConfig(level=logging.DEBUG)
    example_text = '''example:

%(prog)s --benchmark-id 'mamul1' --config '{"matrix_size": 1024, "num_loops":10}' --results-dir /tmp/mamul1_out

'''

    arg_parser = argparse.ArgumentParser(description='submits a benchmark on the compute cluster (assuming this is running from a sge cluster machine where qsub command is available)', epilog=example_text, formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_parser.add_argument('--cluster-id', type=ClusterId, required=True, help='the identifier of cluster on which to submit the benchmark eg (\'dummy\', \'alambix\', etc.)')
    arg_parser.add_argument('--benchmark-id', type=BenchmarkId, required=True, help='the benchmark id of the benchmark to perform (eg mamul1)')
    arg_parser.add_argument('--results-dir', type=Path, required=True, help='the root directory of the tree where the results of the benchmarks are stored (eg $GLOBAL_WORK_DIR/graffy/benchmarks/hibench)')
    arg_parser.add_argument('--config', type=str, default='cmake', help='the benchmark configuration in json format, eg {"compiler_id": "gfortran", "matrix_size": 1024}')
    arg_parser.add_argument('--arch-regexp', type=str, default='.*', help='the regular expression for the architectures the benchmark is allowed to run on (eg "intel_xeon_.*"). By defauls, all available architectures are allowed.')
    arg_parser.add_argument('--resultsdb-params', type=str, required=True, help='the resultsdb configuration in json format, eg {"type": "tsv-files", "tsv_results_dir": "/tmp/toto"}')
    arg_parser.add_argument('--target-system-type-id', type=str, required=True, help='id of the operating system type to use. This is used to get the list installed packages, how to activate them, etc, eg "debian", "fr.univ-rennes.ipr.cluster-node".')

    args = arg_parser.parse_args()
    # bug fix: the original converted the benchmark id with ClusterId; both aliases are
    # plain str so behavior is unchanged, but BenchmarkId is the intended type
    benchmark_id = BenchmarkId(args.benchmark_id)

    results_dir = Path(args.results_dir)
    arch_regexp = args.arch_regexp
    benchmark_config = json.loads(args.config)

    cluster = ClusterFactory().create_cluster(args.cluster_id)
    resultsdb_params = json.loads(args.resultsdb_params)

    # TODO: put the declaration of common params in a common function so that there is only one set of common parameters
    common_params: List[BenchParam] = []
    common_params.append(BenchParam('launcher', BenchParam.Type.PARAM_TYPE_STRING, description='what triggered the benchmark (eg "alambix.job.12345", or "manual")'))

    benchmark = BenchmarkFactory().create_benchmark(benchmark_id, common_params)

    target_system_type_id = HostTypeId(args.target_system_type_id)

    if not cluster.path_is_reachable_by_compute_nodes(results_dir):
        # bug fix: this message was missing its f prefix, so the literal text
        # '{results_dir}' was shown to the user instead of the actual path
        raise ValueError(f'the results path is expected to be on a disk that is accessible to all cluster nodes, and it doesn\'t seem to be the case for {results_dir}')

    launch_perf_jobs(benchmark, benchmark_config, results_dir, cluster, arch_regexp, resultsdb_params, target_system_type_id)
|