added a tool to measure the performance of a specific version of hibridon
This benchmarking tool includes `starbench`, a benchmarking method (inspired by `hpl`'s `stardgemm`) which runs the same command in parallel on each core, so that performance is measured in conditions similar to what a user would experience in a compute cluster environment. This is functional but still a work in progress (the code dates from 03/05/2022).
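
In a minimal sketch (hypothetical values, mirroring the sleep-based self-test at the bottom of the script), the intended usage is:

    bench = StarBencher(run_command=['sleep', '1'], num_cores_per_run=1, num_parallel_runs=2, max_num_cores=2, max_error=0.01)
    mean_duration = bench.run()  # keeps 2 runs in flight until two successive mean durations differ by less than max_error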
This commit is contained in:
commit f136861893

@@ -0,0 +1 @@
This directory contains scripts for hibridon that are specific to IPR (Institut de Physique de Rennes)

@@ -0,0 +1,167 @@
#!/usr/bin/env python3
# this script performs a performance benchmark of hibridon using ipr (Institut de Physique de Rennes)'s cluster
import threading
import subprocess
import os
from typing import List, Dict  # , Set, Tuple, Optional
from datetime import datetime
from pathlib import Path
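

# one Run records a single execution of the benchmarked command: its pid,
# return code, start time and end time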
class Run():

    def __init__(self, run_id: int):
        self.id = run_id
        self.pid = None
        self.return_code = 0
        self.start_time = datetime.now()
        self.end_time = None

    def has_finished(self):
        return self.end_time is not None

    def get_duration(self):
        assert self.has_finished()
        return (self.end_time - self.start_time).total_seconds()


class StarBencher():
    '''
    the 'star' term comes from hpl's stardgemm benchmark, where we launch `n` independent programs on `n` cores
    '''

    def __init__(self, run_command: List[str], num_cores_per_run: int, num_parallel_runs: int, max_num_cores: int, max_error: float = 0.01, stop_on_error=True, run_command_cwd: Path = None):
        assert num_cores_per_run * num_parallel_runs <= max_num_cores
        self.run_command: List[str] = run_command
        self.run_command_cwd = run_command_cwd
        self.num_cores_per_run = num_cores_per_run
        self.num_parallel_runs = num_parallel_runs
        self.max_num_cores: int = max_num_cores
        self.max_error: float = max_error
        self.stop_on_error = stop_on_error
        self._next_run_id: int = 0
        self._runs: Dict[int, Run] = {}
        self._last_mean_duration = None
        self._num_runs = 0
        self._runs_lock = threading.Lock()
        self._finished_event = threading.Event()

    def popen_and_call(self, popen_args, on_exit, run_id: int, cwd: Path):
        """
        Runs the given args in a subprocess.Popen, and then calls the function
        on_exit when the subprocess completes.

        on_exit is a callable object, and popen_args is a list/tuple of args
        that one would give to subprocess.Popen.
        """
        def run_in_thread(popen_args, on_exit):
            print('popen_args', popen_args)
            proc = subprocess.Popen(popen_args, cwd=cwd)
            proc.wait()
            on_exit(proc.pid, proc.returncode, run_id)
            return
        thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit))
        thread.start()
        # returns immediately after the thread starts
        return thread
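
    # design note: each run gets its own watcher thread, so on_exit fires as
    # soon as that run's process exits and can schedule a follow-up run without
    # the main thread having to poll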

    def _get_run_mean_duration(self):
        duration_sums: float = 0.0
        num_finished_runs: int = 0
        with self._runs_lock:
            for run in self._runs.values():
                if run.has_finished():
                    num_finished_runs += 1
                    duration_sums += run.get_duration()
        assert num_finished_runs > 0
        return duration_sums / num_finished_runs, num_finished_runs

    def _all_runs_have_finished(self):
        with self._runs_lock:
            for run in self._runs.values():
                if not run.has_finished():
                    return False
        return True
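
    # on_exit runs on the watcher thread of the run that just finished: it
    # records the run's result, then either launches a new run or declares the
    # benchmark finished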
    def on_exit(self, pid: int, return_code: int, run_id: int):
        end_time = datetime.now()
        run = self._runs[run_id]
        run.pid = pid
        run.end_time = end_time
        run.return_code = return_code

        do_stop = False
        if self.stop_on_error and run.return_code != 0:
            do_stop = True
        else:
            mean_duration, num_runs = self._get_run_mean_duration()
            print('mean_duration = %f' % mean_duration)
            if self._last_mean_duration is not None:
                diff = abs(mean_duration - self._last_mean_duration)
                print('diff = %f' % diff)
                # stop once the mean duration has converged (two successive
                # means differ by less than max_error)
                if diff < self.max_error:
                    do_stop = True
            self._num_runs = num_runs
            self._last_mean_duration = mean_duration

        if not do_stop:
            print('adding a run')
            self._start_run()
        if self._all_runs_have_finished():
            # tell the main thread that all the runs have finished
            self._finished_event.set()

    def _start_run(self):
        print(self.run_command)
        with self._runs_lock:
            run = Run(self._next_run_id)
            self._next_run_id += 1
            run_thread = self.popen_and_call(popen_args=self.run_command, on_exit=self.on_exit, run_id=run.id, cwd=self.run_command_cwd)  # noqa:F841
            self._runs[run.id] = run

    def run(self):
        for _ in range(self.num_parallel_runs):
            self._start_run()
        # wait until all runs have finished
        self._finished_event.wait()
        with self._runs_lock:
            if not all(run.return_code == 0 for run in self._runs.values()):
                raise Exception('at least one run failed')
        print('mean duration : %.3f s (%d runs)' % (self._last_mean_duration, self._num_runs))
        print('finished')
        return self._last_mean_duration


def measure_hibridon_perf(hibridon_version: str, tmp_dir: Path, num_cores: int, github_username: str, github_personal_access_token: str):
    tmp_dir.mkdir(exist_ok=True)
    hibridon_git_url = 'https://%s:%s@github.com/hibridon/hibridon' % (github_username, github_personal_access_token)
    subprocess.run(['git', 'clone', hibridon_git_url], cwd=tmp_dir, check=True)
    src_dir = tmp_dir / 'hibridon'
    assert src_dir.exists()
    subprocess.run(['git', 'checkout', hibridon_version], cwd=src_dir, check=True)
    for compiler in ['gfortran']:  # , 'ifort']:
        build_dir = tmp_dir / compiler
        build_dir.mkdir(exist_ok=True)
        subprocess.run(['cmake', '-DCMAKE_BUILD_TYPE=Release', '-DBUILD_TESTING=ON', src_dir], cwd=build_dir, check=True)
        subprocess.run(['make'], cwd=build_dir, check=True)
        bench = StarBencher(run_command=['ctest', '-L', '^arch4_quick$'], num_cores_per_run=1, num_parallel_runs=num_cores, max_num_cores=num_cores, max_error=0.0001, run_command_cwd=build_dir)
        mean_duration = bench.run()
        print('duration for compiler %s : %.3f s' % (compiler, mean_duration))
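

# entry point and self-tests: the `if True:` / `if False:` toggles below select
# which case to run; the sleep and ls cases are quick sanity checks for
# StarBencher itself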
if __name__ == '__main__':
    if True:
        github_username = 'g-raffy'  # os.environ['HIBRIDON_REPOS_USER']
        with open('%s/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat' % os.environ['HOME'], 'r') as f:
            github_personal_access_token = f.readline().rstrip('\n')  # os.environ['HIBRIDON_REPOS_PAT']
        hibridon_version = '02aeb2c2da5ebe0f7301c9909aa623864e562c71'
        tmp_dir = Path('/tmp/hibridon_perf')
        measure_hibridon_perf(hibridon_version, tmp_dir, num_cores=2, github_username=github_username, github_personal_access_token=github_personal_access_token)

    if False:
        bench = StarBencher(run_command=['sleep', '0.1415927'], num_cores_per_run=1, num_parallel_runs=2, max_num_cores=2, max_error=0.0001)
        mean_duration = bench.run()

    if False:
        bench = StarBencher(run_command=['ls', '/tmp'], num_cores_per_run=1, num_parallel_runs=2, max_num_cores=2, max_error=0.0001)
        mean_duration = bench.run()