added support for worker-dependent commands to `StarBencher`

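This allows `measure_hibridon_perf` to launch `ctest` safely in parallel: any `<worker_id>` placeholder in the run command or in its working directory is replaced with the worker's zero-padded index, so each worker has its own build directory. As a result, `measure_hibridon_perf` now works.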
This commit is contained in:
Guillaume Raffy 2022-05-31 17:59:28 +02:00
parent 35d271ff63
commit 34fc73cccf
1 changed file with 48 additions and 12 deletions

View File

@ -12,8 +12,9 @@ from typing import ForwardRef
class Run():
def __init__(self, run_id: int):
def __init__(self, run_id: int, worker_id: int):
self.id = run_id
self.worker_id = worker_id # the worker used for this run (number of workers = number of parallel runs)
self.pid = None
self.return_code = 0
self.start_time = datetime.now()
@ -131,6 +132,7 @@ class StarBencher():
run.pid = pid
run.end_time = end_time
run.return_code = return_code
do_stop = False
if self.stop_on_error and run.return_code != 0:
do_stop = True
@ -138,22 +140,25 @@ class StarBencher():
do_stop = self.stop_condition.should_stop(self)
if not do_stop:
print('adding a run')
self._start_run()
self._start_run(run.worker_id) # reuse the same worker as the run that has just finished
if self._all_runs_have_finished():
# tell the main thread that all the runs have finished
self._finished_event.set()
def _start_run(self):
def _start_run(self, worker_id: int):
print(self.run_command)
worker_as_str = '%03d' % worker_id
run_command = [str(s).replace('<worker_id>', worker_as_str) for s in self.run_command]
run_command_cwd = str(self.run_command_cwd).replace('<worker_id>', worker_as_str)
with self._runs_lock:
run = Run(self._next_run_id)
run = Run(self._next_run_id, worker_id)
self._next_run_id += 1
run_thread = self.popen_and_call(popen_args=self.run_command, on_exit=self.on_exit, run_id=run.id, cwd=self.run_command_cwd) # noqa:F841
run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd) # noqa:F841
self._runs[run.id] = run
def run(self):
for run_index in range(self.num_parallel_runs):
self._start_run()
for worker_id in range(self.num_parallel_runs):
self._start_run(worker_id)
# wait until all runs have finished
self._finished_event.wait()
with self._runs_lock:
@ -174,13 +179,44 @@ def measure_hibridon_perf(hibridon_version: str, tmp_dir: Path, num_cores: int,
assert src_dir.exists()
for compiler in ['gfortran']: # , 'ifort']:
build_dir = tmp_dir / compiler
build_dir.mkdir(exist_ok=True)
subprocess.run(['cmake', '-DCMAKE_BUILD_TYPE=Release', '-DBUILD_TESTING=ON', src_dir], cwd=build_dir)
subprocess.run(['make'], cwd=build_dir)
# we need one build for each parallel run, otherwise running ctest in parallel would overwrite the same files, which causes the tests to randomly fail depending on race conditions
build_dir = tmp_dir / compiler / 'worker<worker_id>'
create_build_dir = StarBencher(
run_command=['mkdir', '-p', build_dir],
num_cores_per_run=1,
num_parallel_runs=num_cores,
max_num_cores=num_cores,
stop_condition=StopAfterSingleRun(),
run_command_cwd=Path('/tmp'))
create_build_dir_duration = create_build_dir.run() # noqa: F841
# build_dir.mkdir(exist_ok=True)
configure = StarBencher(
run_command=['cmake', '-DCMAKE_BUILD_TYPE=Release', '-DBUILD_TESTING=ON', src_dir],
num_cores_per_run=1,
num_parallel_runs=num_cores,
max_num_cores=num_cores,
stop_condition=StopAfterSingleRun(),
run_command_cwd=build_dir)
configure_duration = configure.run() # noqa: F841
build = StarBencher(
run_command=['make'],
num_cores_per_run=1,
num_parallel_runs=num_cores,
max_num_cores=num_cores,
stop_condition=StopAfterSingleRun(),
run_command_cwd=build_dir)
build_duration = build.run() # noqa: F841
stop_condition = StopAfterSingleRun()
bench = StarBencher(run_command=['ctest', '--output-on-failure', '-L', '^arch4_quick$'], num_cores_per_run=1, num_parallel_runs=num_cores, max_num_cores=num_cores, stop_condition=stop_condition, run_command_cwd=build_dir)
bench = StarBencher(
run_command=['ctest', '--output-on-failure', '-L', '^arch4_quick$'],
num_cores_per_run=1,
num_parallel_runs=num_cores,
max_num_cores=num_cores,
stop_condition=stop_condition,
run_command_cwd=build_dir)
mean_duration = bench.run()
print('duration for compiler %s : %.3f s' % (compiler, mean_duration))