From f2ceeb2cdb638a8c0129877a1ad823b00c7dc0fe Mon Sep 17 00:00:00 2001 From: Guillaume Raffy Date: Tue, 8 Oct 2024 16:46:53 +0200 Subject: [PATCH] added a mechanism to prevent starbench from hanging in case the executed command fails I had the case where on_exit() was never called because proc had no value and therefore the attempt to pass proc.pid to on_exit caused an exception before on_exit was called. As a result, the master thread was waiting for its child threads forever, as these children never signaled that they had finished. work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372] --- src/starbench/core.py | 44 ++++++++++++++++++++++++++----------------- src/starbench/main.py | 3 ++- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/starbench/core.py b/src/starbench/core.py index 416883c..f8cca14 100755 --- a/src/starbench/core.py +++ b/src/starbench/core.py @@ -160,22 +160,32 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef def run_in_thread(popen_args: List[str], on_exit: Callable[[ProcessId, ReturnCode, RunId], None]): stdout = None stderr = None - if stdout_filepath is not None: - stdout = open(stdout_filepath, 'w', encoding='utf8') - if stderr_filepath is not None: - stderr = open(stderr_filepath, 'w', encoding='utf8') - env = os.environ.copy() - # restrict the number of threads used by openmp - env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}' - # restrict the nu,ber of threads used by intel math kernel library - env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}' - proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env) - proc.wait() - if stderr is not None: - stderr.close() - if stdout is not None: - stdout.close() - on_exit(proc.pid, proc.returncode, run_id) + returncode = -1 + pid = -1 + streams_are_ok = True + try: + # with open(stdout_filepath, 'w', encoding='utf8') as stdout, open(stderr_filepath, 'w', encoding='utf8') as stderr: + if stdout_filepath is not 
None: + stdout = open(stdout_filepath, 'w', encoding='utf8') + if stderr_filepath is not None: + stderr = open(stderr_filepath, 'w', encoding='utf8') + except: + print(f'failed to open {stdout_filepath} or {stderr_filepath} in write mode') + streams_are_ok = False + if streams_are_ok: + try: + env = os.environ.copy() + # restrict the number of threads used by openmp + env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}' + # restrict the nu,ber of threads used by intel math kernel library + env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}' + proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env) + pid = proc.pid + proc.wait() + returncode = proc.returncode + except: + print(f'command failed: {popen_args}') + on_exit(pid, returncode, run_id) return thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit)) thread.start() @@ -255,8 +265,8 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef with self._runs_lock: run = Run(self._next_run_id, worker_id) self._next_run_id += 1 - _run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath) # noqa:F841 self._runs[run.id] = run + _run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath) # noqa:F841 def run(self) -> DurationInSeconds: '''performs the runs of the command and returns the runs' average duration''' diff --git a/src/starbench/main.py b/src/starbench/main.py index b4e30e5..7535ca1 100755 --- a/src/starbench/main.py +++ b/src/starbench/main.py @@ -77,7 +77,8 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path, max_num_cores=num_cores, stop_condition=StopAfterSingleRun(), run_command_cwd=Path('/tmp'), - stdout_filepath=None) + stdout_filepath=worker_dir / 
'createdir_stdout.txt', + stderr_filepath=worker_dir / 'createdir_stderr.txt') _create_build_dir_duration = create_build_dir.run() # noqa: F841 # build_dir.mkdir(exist_ok=True)