added a mechanism to prevent starbench from hanging when the executed command fails

I had a case where on_exit() was never called: proc had no value, so the attempt to pass proc.pid to on_exit raised an exception before on_exit could be called. As a result, the master thread waited forever for its child threads, because these children never signaled that they had finished.

work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372]
This commit is contained in:
Guillaume Raffy 2024-10-08 16:46:53 +02:00
parent d71bf3f67f
commit f2ceeb2cdb
2 changed files with 29 additions and 18 deletions

View File

@ -160,22 +160,32 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
def run_in_thread(popen_args: List[str], on_exit: Callable[[ProcessId, ReturnCode, RunId], None]): def run_in_thread(popen_args: List[str], on_exit: Callable[[ProcessId, ReturnCode, RunId], None]):
stdout = None stdout = None
stderr = None stderr = None
if stdout_filepath is not None: returncode = -1
stdout = open(stdout_filepath, 'w', encoding='utf8') pid = -1
if stderr_filepath is not None: streams_are_ok = True
stderr = open(stderr_filepath, 'w', encoding='utf8') try:
env = os.environ.copy() # with open(stdout_filepath, 'w', encoding='utf8') as stdout, open(stderr_filepath, 'w', encoding='utf8') as stderr:
# restrict the number of threads used by openmp if stdout_filepath is not None:
env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}' stdout = open(stdout_filepath, 'w', encoding='utf8')
# restrict the nu,ber of threads used by intel math kernel library if stderr_filepath is not None:
env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}' stderr = open(stderr_filepath, 'w', encoding='utf8')
proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env) except:
proc.wait() print(f'failed to open {stdout_filepath} or {stderr_filepath} in write mode')
if stderr is not None: streams_are_ok = False
stderr.close() if streams_are_ok:
if stdout is not None: try:
stdout.close() env = os.environ.copy()
on_exit(proc.pid, proc.returncode, run_id) # restrict the number of threads used by openmp
env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}'
# restrict the nu,ber of threads used by intel math kernel library
env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}'
proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env)
pid = proc.pid
proc.wait()
returncode = proc.returncode
except:
print(f'command failed: {popen_args}')
on_exit(pid, returncode, run_id)
return return
thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit)) thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit))
thread.start() thread.start()
@ -255,8 +265,8 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
with self._runs_lock: with self._runs_lock:
run = Run(self._next_run_id, worker_id) run = Run(self._next_run_id, worker_id)
self._next_run_id += 1 self._next_run_id += 1
_run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath) # noqa:F841
self._runs[run.id] = run self._runs[run.id] = run
_run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath) # noqa:F841
def run(self) -> DurationInSeconds: def run(self) -> DurationInSeconds:
'''performs the runs of the command and returns the runs' average duration''' '''performs the runs of the command and returns the runs' average duration'''

View File

@ -77,7 +77,8 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
max_num_cores=num_cores, max_num_cores=num_cores,
stop_condition=StopAfterSingleRun(), stop_condition=StopAfterSingleRun(),
run_command_cwd=Path('/tmp'), run_command_cwd=Path('/tmp'),
stdout_filepath=None) stdout_filepath=worker_dir / 'createdir_stdout.txt',
stderr_filepath=worker_dir / 'createdir_stderr.txt')
_create_build_dir_duration = create_build_dir.run() # noqa: F841 _create_build_dir_duration = create_build_dir.run() # noqa: F841
# build_dir.mkdir(exist_ok=True) # build_dir.mkdir(exist_ok=True)