Added a mechanism to prevent starbench from hanging when the executed command fails
I hit a case where on_exit() was never called: proc had no value, so the attempt to pass proc.pid to on_exit raised an exception before on_exit could be called. As a result, the master thread waited forever for its child threads, because these children never signaled that they had finished. Work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372]
This commit is contained in:
		
							parent
							
								
									d71bf3f67f
								
							
						
					
					
						commit
						f2ceeb2cdb
					
				| 
						 | 
					@ -160,22 +160,32 @@ class CommandPerfEstimator():  # (false positive) pylint: disable=function-redef
 | 
				
			||||||
        def run_in_thread(popen_args: List[str], on_exit: Callable[[ProcessId, ReturnCode, RunId], None]):
 | 
					        def run_in_thread(popen_args: List[str], on_exit: Callable[[ProcessId, ReturnCode, RunId], None]):
 | 
				
			||||||
            stdout = None
 | 
					            stdout = None
 | 
				
			||||||
            stderr = None
 | 
					            stderr = None
 | 
				
			||||||
            if stdout_filepath is not None:
 | 
					            returncode = -1
 | 
				
			||||||
                stdout = open(stdout_filepath, 'w', encoding='utf8')
 | 
					            pid = -1
 | 
				
			||||||
            if stderr_filepath is not None:
 | 
					            streams_are_ok = True
 | 
				
			||||||
                stderr = open(stderr_filepath, 'w', encoding='utf8')
 | 
					            try:
 | 
				
			||||||
            env = os.environ.copy()
 | 
					                # with open(stdout_filepath, 'w', encoding='utf8') as stdout, open(stderr_filepath, 'w', encoding='utf8') as stderr:
 | 
				
			||||||
            # restrict the number of threads used by openmp
 | 
					                if stdout_filepath is not None:
 | 
				
			||||||
            env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}'
 | 
					                    stdout = open(stdout_filepath, 'w', encoding='utf8')
 | 
				
			||||||
            # restrict the nu,ber of threads used by intel math kernel library
 | 
					                if stderr_filepath is not None:
 | 
				
			||||||
            env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}'
 | 
					                    stderr = open(stderr_filepath, 'w', encoding='utf8')
 | 
				
			||||||
            proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env)
 | 
					            except:
 | 
				
			||||||
            proc.wait()
 | 
					                print(f'failed to open {stdout_filepath} or {stderr_filepath} in write mode')
 | 
				
			||||||
            if stderr is not None:
 | 
					                streams_are_ok = False
 | 
				
			||||||
                stderr.close()
 | 
					            if streams_are_ok:
 | 
				
			||||||
            if stdout is not None:
 | 
					                try:
 | 
				
			||||||
                stdout.close()
 | 
					                    env = os.environ.copy()
 | 
				
			||||||
            on_exit(proc.pid, proc.returncode, run_id)
 | 
					                    # restrict the number of threads used by openmp
 | 
				
			||||||
 | 
					                    env['OMP_NUM_THREADS'] = f'{self.num_cores_per_run}'
 | 
				
			||||||
 | 
					                    # restrict the nu,ber of threads used by intel math kernel library
 | 
				
			||||||
 | 
					                    env['MKL_NUM_THREADS'] = f'{self.num_cores_per_run}'
 | 
				
			||||||
 | 
					                    proc = subprocess.Popen(popen_args, cwd=cwd, stdout=stdout, stderr=stderr, env=env)
 | 
				
			||||||
 | 
					                    pid = proc.pid
 | 
				
			||||||
 | 
					                    proc.wait()
 | 
				
			||||||
 | 
					                    returncode = proc.returncode
 | 
				
			||||||
 | 
					                except:
 | 
				
			||||||
 | 
					                    print(f'command failed: {popen_args}')
 | 
				
			||||||
 | 
					            on_exit(pid, returncode, run_id)
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
        thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit))
 | 
					        thread = threading.Thread(target=run_in_thread, args=(popen_args, on_exit))
 | 
				
			||||||
        thread.start()
 | 
					        thread.start()
 | 
				
			||||||
| 
						 | 
					@ -255,8 +265,8 @@ class CommandPerfEstimator():  # (false positive) pylint: disable=function-redef
 | 
				
			||||||
        with self._runs_lock:
 | 
					        with self._runs_lock:
 | 
				
			||||||
            run = Run(self._next_run_id, worker_id)
 | 
					            run = Run(self._next_run_id, worker_id)
 | 
				
			||||||
            self._next_run_id += 1
 | 
					            self._next_run_id += 1
 | 
				
			||||||
            _run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath)  # noqa:F841
 | 
					 | 
				
			||||||
            self._runs[run.id] = run
 | 
					            self._runs[run.id] = run
 | 
				
			||||||
 | 
					            _run_thread = self.popen_and_call(popen_args=run_command, on_exit=self.on_exit, run_id=run.id, cwd=run_command_cwd, stdout_filepath=stdout_filepath, stderr_filepath=stderr_filepath)  # noqa:F841
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def run(self) -> DurationInSeconds:
 | 
					    def run(self) -> DurationInSeconds:
 | 
				
			||||||
        '''performs the runs of the command and returns the runs' average duration'''
 | 
					        '''performs the runs of the command and returns the runs' average duration'''
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -77,7 +77,8 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
 | 
				
			||||||
        max_num_cores=num_cores,
 | 
					        max_num_cores=num_cores,
 | 
				
			||||||
        stop_condition=StopAfterSingleRun(),
 | 
					        stop_condition=StopAfterSingleRun(),
 | 
				
			||||||
        run_command_cwd=Path('/tmp'),
 | 
					        run_command_cwd=Path('/tmp'),
 | 
				
			||||||
        stdout_filepath=None)
 | 
					        stdout_filepath=worker_dir / 'createdir_stdout.txt',
 | 
				
			||||||
 | 
					        stderr_filepath=worker_dir / 'createdir_stderr.txt')
 | 
				
			||||||
    _create_build_dir_duration = create_build_dir.run()  # noqa: F841
 | 
					    _create_build_dir_duration = create_build_dir.run()  # noqa: F841
 | 
				
			||||||
    # build_dir.mkdir(exist_ok=True)
 | 
					    # build_dir.mkdir(exist_ok=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue