Added a mechanism to prevent starbench from hanging in case the executed command fails
I had the case where on_exit() was never called because proc had no value, and therefore the attempt to pass proc.pid to on_exit caused an exception before on_exit was called. As a result, the master thread was waiting for its child threads forever, as these children never signaled that they had finished. Work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372]
This commit is contained in:
parent
d71bf3f67f
commit
f2ceeb2cdb
|
@ -160,22 +160,32 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
|
||||||
def run_in_thread(popen_args: List[str], on_exit: Callable[[ProcessId, ReturnCode, RunId], None]):
|
def run_in_thread(popen_args: List[str], on_exit: Callable[[ProcessId, ReturnCode, RunId], None]):
|
||||||
stdout = None
|
stdout = None
|
||||||
stderr = None
|
stderr = None
|
||||||
if stdout_filepath is not None:
|
returncode = -1
|
||||||
stdout = open(stdout_filepath, 'w', encoding='utf8')
|
pid = -1
|
||||||
if stderr_filepath is not None:
|
streams_are_ok = True
|
||||||
stderr = open(stderr_filepath, 'w', encoding='utf8')
|
try:
|
||||||
env = os.environ.copy()
|
# with open(stdout_filepath, 'w', encoding='utf8') as stdout, open(stderr_filepath, 'w', encoding='utf8') as stderr:
|
||||||
# restrict the number of threads used by openmp
|
if stdout_filepath is not None:
|
||||||
env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}'
|
stdout = open(stdout_filepath, 'w', encoding='utf8')
|
||||||
# restrict the nu,ber of threads used by intel math kernel library
|
if stderr_filepath is not None:
|
||||||
env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}'
|
stderr = open(stderr_filepath, 'w', encoding='utf8')
|
||||||
proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env)
|
except:
|
||||||
proc.wait()
|
print(f'failed to open {stdout_filepath} or {stderr_filepath} in write mode')
|
||||||
if stderr is not None:
|
streams_are_ok = False
|
||||||
stderr.close()
|
if streams_are_ok:
|
||||||
if stdout is not None:
|
try:
|
||||||
stdout.close()
|
env = os.environ.copy()
|
||||||
on_exit(proc.pid, proc.returncode, run_id)
|
# restrict the number of threads used by openmp
|
||||||
|
env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}'
|
||||||
|
# restrict the nu,ber of threads used by intel math kernel library
|
||||||
|
env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}'
|
||||||
|
proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env)
|
||||||
|
pid = proc.pid
|
||||||
|
proc.wait()
|
||||||
|
returncode = proc.returncode
|
||||||
|
except:
|
||||||
|
print(f'command failed: {popen_args}')
|
||||||
|
on_exit(pid, returncode, run_id)
|
||||||
return
|
return
|
||||||
thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit))
|
thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit))
|
||||||
thread.start()
|
thread.start()
|
||||||
|
@ -255,8 +265,8 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
|
||||||
with self._runs_lock:
|
with self._runs_lock:
|
||||||
run = Run(self._next_run_id, worker_id)
|
run = Run(self._next_run_id, worker_id)
|
||||||
self._next_run_id += 1
|
self._next_run_id += 1
|
||||||
_run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath) # noqa:F841
|
|
||||||
self._runs[run.id] = run
|
self._runs[run.id] = run
|
||||||
|
_run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath) # noqa:F841
|
||||||
|
|
||||||
def run(self) -> DurationInSeconds:
|
def run(self) -> DurationInSeconds:
|
||||||
'''performs the runs of the command and returns the runs' average duration'''
|
'''performs the runs of the command and returns the runs' average duration'''
|
||||||
|
|
|
@ -77,7 +77,8 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
|
||||||
max_num_cores=num_cores,
|
max_num_cores=num_cores,
|
||||||
stop_condition=StopAfterSingleRun(),
|
stop_condition=StopAfterSingleRun(),
|
||||||
run_command_cwd=Path('/tmp'),
|
run_command_cwd=Path('/tmp'),
|
||||||
stdout_filepath=None)
|
stdout_filepath=worker_dir / 'createdir_stdout.txt',
|
||||||
|
stderr_filepath=worker_dir / 'createdir_stderr.txt')
|
||||||
_create_build_dir_duration = create_build_dir.run() # noqa: F841
|
_create_build_dir_duration = create_build_dir.run() # noqa: F841
|
||||||
# build_dir.mkdir(exist_ok=True)
|
# build_dir.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue