refactored cluster related code (renamed Job as Task and Job2 as Job to avoid confusion)
when Job2 was introduced, it was actually representing a job, but the class Job (which actually represented a Task, and this was misleading) already existed also: - added type hinting to ease code understanding - took this opportunity to fix styling issues
This commit is contained in:
		
							parent
							
								
									f36b2d9d9c
								
							
						
					
					
						commit
						6bf69f909b
					
				|  | @ -1,3 +1,6 @@ | |||
| from typing import Optional, Dict, List | ||||
| from datetime import datetime | ||||
| 
 | ||||
| 
 | ||||
| class JobStateFlags: | ||||
|     RUNNING = 1    # the job is running | ||||
|  | @ -14,133 +17,161 @@ class ParallelEnvironment: | |||
|     MPI = 1 | ||||
| 
 | ||||
| 
 | ||||
| MemoryUnit = int  # number of bytes | ||||
| JobState = int  # combination of JobStateFlags | ||||
| JobId = int  # the job id in the sense of sge's job id : a simple unique number | ||||
| TaskId = int  # the id of a task within its array Job | ||||
| QueueId = str  # eg 'main.q' | ||||
| QueueMachineId = str  # the identifier of a queue machine, eg 'main.q@physix99.ipr.univ-rennes.fr' | ||||
| ResourceRequest = str  # eg 'mem_available=5G' | ||||
| 
 | ||||
| 
 | ||||
| class JobRequirements: | ||||
|     num_slots: Optional[int] | ||||
|     architecture: Optional[str]    # machine architecture | ||||
|     m_parallelEnvironment: Optional[int]  # todo: make ParallelEnvironment an Enum | ||||
|     queues: Optional[List[QueueId]]  # the list of queues this job is allowed to run on | ||||
|     resources: Optional[List[ResourceRequest]] | ||||
| 
 | ||||
|     def __init__(self): | ||||
|         self.m_numSlots = None | ||||
|         self.m_strArchitecture = None    # machine architecture | ||||
|         self.num_slots = None | ||||
|         self.architecture = None | ||||
|         self.m_parallelEnvironment = None | ||||
|         self.m_queues = None  # the list of queues this job is allowed to run on | ||||
|         self.queues = None | ||||
|         self.resources = None | ||||
| 
 | ||||
| 
 | ||||
| class JobId: | ||||
| class TaskUid: | ||||
|     """ | ||||
|         the identifier of a job. | ||||
|         We treat each element of a job array as a separate job | ||||
|         the identifier of a task, in the form <job_id>.<element_id> | ||||
|         We treat each element of a job array as a separate task | ||||
|         A single integer is no longer enough to identify a job because all elements in a job array | ||||
|         share the same sge job identifier. To uniquely define a job array element, we also use the task id. | ||||
|     """ | ||||
|     job_id: JobId | ||||
|     task_id: Optional[TaskId]  # the identifier of a task within its job  None if this identifier does not refer to a job array element | ||||
|     MAX_NUM_JOBS_IN_ARRAY = 1000000 | ||||
| 
 | ||||
|     def __init__(self, iJobId, iJobArrayElementId=None): | ||||
|         if iJobArrayElementId is not None: | ||||
|             assert iJobArrayElementId <= self.MAX_NUM_JOBS_IN_ARRAY | ||||
|         self.m_iJobId = iJobId | ||||
|         self.m_iJobArrayElementId = iJobArrayElementId  # None if this identifier does not refer to a job array element | ||||
|     def __init__(self, job_id: JobId, task_id: Optional[TaskId] = None): | ||||
|         if task_id is not None: | ||||
|             assert task_id <= self.MAX_NUM_JOBS_IN_ARRAY | ||||
|         self.job_id = job_id | ||||
|         self.task_id = task_id | ||||
| 
 | ||||
|     def __hash__(self): | ||||
|         """ | ||||
|             required to use a JobId as a dict hash key | ||||
|             required to use a TaskUid as a dict hash key | ||||
|         """ | ||||
|         hash = self.m_iJobId * self.MAX_NUM_JOBS_IN_ARRAY | ||||
|         if self.m_iJobArrayElementId is not None: | ||||
|             hash += self.m_iJobArrayElementId | ||||
|         hash = self.job_id * self.MAX_NUM_JOBS_IN_ARRAY | ||||
|         if self.task_id is not None: | ||||
|             hash += self.task_id | ||||
|         return hash | ||||
| 
 | ||||
|     def __eq__(self, other): | ||||
|     def __eq__(self, other: 'TaskUid'): | ||||
|         """ | ||||
|             required to use a JobId as a dict hash key | ||||
|             required to use a TaskUid as a dict hash key | ||||
|         """ | ||||
|         if self.m_iJobId != other.m_iJobId: | ||||
|         if self.job_id != other.job_id: | ||||
|             return False | ||||
|         if self.m_iJobArrayElementId != other.m_iJobArrayElementId: | ||||
|         if self.task_id != other.task_id: | ||||
|             return False | ||||
|         return True | ||||
| 
 | ||||
|     def isJobArrayElement(self): | ||||
|         return (self.m_iJobArrayElementId is not None) | ||||
|     def is_job_array_element(self) -> bool: | ||||
|         return (self.task_id is not None) | ||||
| 
 | ||||
|     def getMainId(self): | ||||
|         return self.m_iJobId | ||||
|     def get_job_id(self) -> JobId: | ||||
|         return self.job_id | ||||
| 
 | ||||
|     def asStr(self): | ||||
|         strResult = '%s' % self.m_iJobId | ||||
|         if self.isJobArrayElement(): | ||||
|             strResult += '.%d' % self.m_iJobArrayElementId | ||||
|         return strResult | ||||
|     def as_str(self): | ||||
|         result = '%s' % self.job_id | ||||
|         if self.is_job_array_element(): | ||||
|             result += '.%d' % self.task_id | ||||
|         return result | ||||
| 
 | ||||
| 
 | ||||
| class Job: | ||||
|     def __init__(self, jobId): | ||||
|         self.m_jobId = jobId | ||||
|         self.m_startTime = None | ||||
|         self.m_submitTime = None | ||||
|         self.m_owner = None | ||||
|         self.m_scriptName = None | ||||
|         self.m_slots = {} | ||||
|         self.m_stateFlags = 0 | ||||
|         self.m_jobRequirements = JobRequirements() | ||||
|         self.m_requestedRamPerCore = 0 | ||||
| class Task: | ||||
|     task_uid: TaskUid  # the unique identified of this task, eg '12345.789' | ||||
|     start_time: Optional[datetime] | ||||
|     submit_time: Optional[datetime] | ||||
|     owner: Optional[str] | ||||
|     script_name: Optional[str] | ||||
|     slots: Dict[QueueMachineId, int] | ||||
|     state_flags: JobStateFlags | ||||
|     job_requirements: JobRequirements | ||||
|     requested_ram_per_core: MemoryUnit | ||||
| 
 | ||||
|     def getId(self): | ||||
|         return self.m_jobId | ||||
|     def __init__(self, task_uid): | ||||
|         self.task_uid = task_uid | ||||
|         self.start_time = None | ||||
|         self.submit_time = None | ||||
|         self.owner = None | ||||
|         self.script_name = None | ||||
|         self.slots = {} | ||||
|         self.state_flags = 0 | ||||
|         self.job_requirements = JobRequirements() | ||||
|         self.requested_ram_per_core = 0 | ||||
| 
 | ||||
|     def setState(self, state): | ||||
|         self.m_stateFlags = state | ||||
|     def get_id(self): | ||||
|         return self.task_uid | ||||
| 
 | ||||
|     def setOwner(self, jobOwner): | ||||
|         if self.m_owner: | ||||
|             assert self.m_owner == jobOwner | ||||
|         self.m_owner = jobOwner | ||||
|     def set_state(self, state: JobState): | ||||
|         self.state_flags = state | ||||
| 
 | ||||
|     def getOwner(self): | ||||
|         return self.m_owner | ||||
|     def set_owner(self, job_owner: str): | ||||
|         if self.owner: | ||||
|             assert self.owner == job_owner | ||||
|         self.owner = job_owner | ||||
| 
 | ||||
|     def setStartTime(self, jobStartTime): | ||||
|         if self.m_startTime: | ||||
|             assert self.m_startTime == jobStartTime | ||||
|         self.m_startTime = jobStartTime | ||||
|     def get_owner(self) -> str: | ||||
|         return self.owner | ||||
| 
 | ||||
|     def setSubmitTime(self, jobSubmitTime): | ||||
|         if self.m_submitTime: | ||||
|             assert self.m_submitTime == jobSubmitTime | ||||
|         self.m_submitTime = jobSubmitTime | ||||
|     def set_start_time(self, job_start_time: datetime): | ||||
|         if self.start_time: | ||||
|             assert self.start_time == job_start_time | ||||
|         self.start_time = job_start_time | ||||
| 
 | ||||
|     def getStartTime(self): | ||||
|         return self.m_startTime | ||||
|     def get_submit_time(self, job_submit_time: datetime): | ||||
|         if self.submit_time: | ||||
|             assert self.submit_time == job_submit_time | ||||
|         self.submit_time = job_submit_time | ||||
| 
 | ||||
|     def setScriptName(self, jobScriptName): | ||||
|         if self.m_scriptName: | ||||
|             assert self.m_scriptName == jobScriptName | ||||
|         self.m_scriptName = jobScriptName | ||||
|     def get_start_time(self) -> datetime: | ||||
|         return self.start_time | ||||
| 
 | ||||
|     def addSlots(self, queueMachineName, numSlots): | ||||
|         assert self.m_slots.get(queueMachineName) is None | ||||
|         if self.m_slots.get(queueMachineName) is None: | ||||
|             self.m_slots[queueMachineName] = numSlots | ||||
|     def set_script_name(self, job_script_name: str): | ||||
|         if self.script_name: | ||||
|             assert self.script_name == job_script_name | ||||
|         self.script_name = job_script_name | ||||
| 
 | ||||
|     def add_slots(self, queue_machine_id: QueueMachineId, num_slots: int): | ||||
|         assert self.slots.get(queue_machine_id) is None | ||||
|         if self.slots.get(queue_machine_id) is None: | ||||
|             self.slots[queue_machine_id] = num_slots | ||||
|         else: | ||||
|             # should never happen | ||||
|             self.m_slots[queueMachineName] += numSlots | ||||
|             self.slots[queue_machine_id] += num_slots | ||||
| 
 | ||||
|     def getSlots(self): | ||||
|         return self.m_slots | ||||
|     def get_slots(self) -> Dict[QueueMachineId, int]: | ||||
|         return self.slots | ||||
| 
 | ||||
|     def setNumRequiredSlots(self, numSlots): | ||||
|         self.m_jobRequirements.m_numSlots = numSlots | ||||
|     def set_num_required_slots(self, num_slots: int): | ||||
|         self.job_requirements.num_slots = num_slots | ||||
| 
 | ||||
|     def isPending(self): | ||||
|     def is_pending(self): | ||||
|         """ | ||||
|             returns true if this job is waiting in the queue for whatever reason | ||||
|             returns true if this task is waiting in the queue for whatever reason | ||||
|         """ | ||||
|         return self.m_stateFlags & JobStateFlags.QUEUED | ||||
|         return self.state_flags & JobStateFlags.QUEUED | ||||
| 
 | ||||
|     def getRequestedRamPerCore(self): | ||||
|     def get_requested_ram_per_core(self) -> MemoryUnit: | ||||
|         """ | ||||
|             requested RAM per core in bytes | ||||
|         """ | ||||
|         return self.m_requestedRamPerCore | ||||
|         return self.requested_ram_per_core | ||||
| 
 | ||||
|     def setRequestedRamPerCore(self, requestedRam): | ||||
|     def set_requested_ram_per_core(self, requested_ram: MemoryUnit): | ||||
|         """ | ||||
|             requestedRam : requested RAM per core in bytes | ||||
|         """ | ||||
|         self.m_requestedRamPerCore = requestedRam | ||||
|         self.requested_ram_per_core = requested_ram | ||||
|  |  | |||
|  | @ -1,35 +1,40 @@ | |||
| from typing import Dict | ||||
| from .Log import * | ||||
| from .Job import Task, TaskUid | ||||
| 
 | ||||
| 
 | ||||
| class JobsState: | ||||
|     """ | ||||
|         represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*" | ||||
|     """ | ||||
|     tasks: Dict[TaskUid, Task] | ||||
|     job_array_tasks: Dict[int, Dict[TaskUid, Task]] | ||||
| 
 | ||||
|     def __init__(self): | ||||
|         self.m_jobs = {}  # list of jobs | ||||
|         self.m_jobArrayJobs = {}  # a dictionary of jobs for each job array, indexed by job array id | ||||
|         self.tasks = {}  # list of jobs | ||||
|         self.job_array_tasks = {}  # a dictionary of jobs for each job array, indexed by job array id | ||||
|         self.m_queueMachines = {}    # list of queue machines such as allintel.q@simpatix10 | ||||
|         self.m_stateTime = None  # the time at which the state was snapshot | ||||
| 
 | ||||
|     def deleteAllJobs(self): | ||||
|         self.m_jobs = {} | ||||
|         self.m_jobArrayJobs = {} | ||||
|         self.tasks = {} | ||||
|         self.job_array_tasks = {} | ||||
| 
 | ||||
|     def addJob(self, job): | ||||
|         jobId = job.getId() | ||||
|         self.m_jobs[jobId] = job | ||||
|         if jobId.isJobArrayElement(): | ||||
|             tasks = self.m_jobArrayJobs.get(jobId.m_iJobId) | ||||
|     def addTask(self, task: Task): | ||||
|         task_uid = task.get_id() | ||||
|         self.tasks[task_uid] = task | ||||
|         if task_uid.is_job_array_element(): | ||||
|             tasks = self.job_array_tasks.get(task_uid.job_id) | ||||
|             if tasks is None: | ||||
|                 tasks = {} | ||||
|                 self.m_jobArrayJobs[jobId.m_iJobId] = tasks | ||||
|             tasks[jobId] = job | ||||
|                 self.job_array_tasks[task_uid.job_id] = tasks | ||||
|             tasks[task_uid] = task | ||||
| 
 | ||||
|     def getJob(self, jobId): | ||||
|         return self.m_jobs.get(jobId) | ||||
|     def get_task(self, task_uid: TaskUid) -> Task: | ||||
|         return self.tasks.get(task_uid) | ||||
| 
 | ||||
|     def getJobArrayJobs(self, iJobArrayId): | ||||
|         return self.m_jobArrayJobs.get(iJobArrayId) | ||||
|     def get_job_array_tasks(self, job_array_id: int) -> Dict[TaskUid, Task]: | ||||
|         return self.job_array_tasks.get(job_array_id) | ||||
| 
 | ||||
|     def setTime(self, stateTime): | ||||
|         self.m_stateTime = stateTime | ||||
|  | @ -38,19 +43,19 @@ class JobsState: | |||
|         return self.m_stateTime | ||||
| 
 | ||||
|     def getJobsOnMachine(self, machineName): | ||||
|         jobsOnMachine = {} | ||||
|         for jobId, job in self.m_jobs.items(): | ||||
|             for queueMachineName, numSlots in job.getSlots().items(): | ||||
|         jobs_on_machine = {} | ||||
|         for task_uid, task in self.tasks.items(): | ||||
|             for queueMachineName, numSlots in task.get_slots().items(): | ||||
|                 jobMachineName = queueMachineName.split('@')[1] | ||||
|                 if jobMachineName == machineName: | ||||
|                     jobsOnMachine[jobId] = job | ||||
|         return jobsOnMachine | ||||
|                     jobs_on_machine[task_uid] = task | ||||
|         return jobs_on_machine | ||||
| 
 | ||||
|     def getNumFreeSlotsOnQueueMachine(self, queueMachine): | ||||
|         # logInfo('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.getName()) | ||||
|         numUsedSlots = 0 | ||||
|         for job in self.m_jobs.values(): | ||||
|             numUsedSlotsByThisJob = job.getSlots().get(queueMachine.getName()) | ||||
|         for job in self.tasks.values(): | ||||
|             numUsedSlotsByThisJob = job.get_slots().get(queueMachine.getName()) | ||||
|             if numUsedSlotsByThisJob is not None: | ||||
|                 # logInfo('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob)) | ||||
|                 numUsedSlots += numUsedSlotsByThisJob | ||||
|  | @ -80,7 +85,7 @@ class JobsState: | |||
| 
 | ||||
|     def getPendingJobs(self): | ||||
|         pendingJobs = {} | ||||
|         for jobId, job in self.m_jobs.items(): | ||||
|             if job.isPending(): | ||||
|                 pendingJobs[job.getId()] = job | ||||
|         for jobId, job in self.tasks.items(): | ||||
|             if job.is_pending(): | ||||
|                 pendingJobs[job.get_id()] = job | ||||
|         return pendingJobs | ||||
|  |  | |||
|  | @ -4,35 +4,35 @@ from .JobsState import JobsState | |||
| from .QueueMachine import QueueMachine, QueueMachineStateFlags | ||||
| from .Util import * | ||||
| from .Log import logError | ||||
| from .Job import JobStateFlags, JobId, Job, ParallelEnvironment | ||||
| from .Job import JobStateFlags, TaskUid, Task, ParallelEnvironment, JobState | ||||
| import logging | ||||
| import time | ||||
| 
 | ||||
| 
 | ||||
| class QstatParser: | ||||
|     def parseJobState(self, strJobStatus): | ||||
|         jobState = 0 | ||||
|         for i in range(0, len(strJobStatus)): | ||||
|             c = strJobStatus[i] | ||||
|     def parseJobState(self, job_status_as_str: str) -> JobState: | ||||
|         job_state = 0 | ||||
|         for i in range(0, len(job_status_as_str)): | ||||
|             c = job_status_as_str[i] | ||||
|             if c == 'r': | ||||
|                 jobState += JobStateFlags.RUNNING | ||||
|                 job_state += JobStateFlags.RUNNING | ||||
|             elif c == 'w': | ||||
|                 jobState += JobStateFlags.WAITING | ||||
|                 job_state += JobStateFlags.WAITING | ||||
|             elif c == 'q': | ||||
|                 jobState += JobStateFlags.QUEUED | ||||
|                 job_state += JobStateFlags.QUEUED | ||||
|             elif c == 't': | ||||
|                 jobState += JobStateFlags.TRANSFERING | ||||
|                 job_state += JobStateFlags.TRANSFERING | ||||
|             elif c == 'd': | ||||
|                 jobState += JobStateFlags.DELETED | ||||
|                 job_state += JobStateFlags.DELETED | ||||
|             elif c == 'h': | ||||
|                 jobState += JobStateFlags.HOLD | ||||
|                 job_state += JobStateFlags.HOLD | ||||
|             elif c == 's': | ||||
|                 jobState += JobStateFlags.SUSPENDED | ||||
|                 job_state += JobStateFlags.SUSPENDED | ||||
|             elif c == 'E': | ||||
|                 jobState += JobStateFlags.ERROR | ||||
|                 job_state += JobStateFlags.ERROR | ||||
|             else: | ||||
|                 assert False, 'unhandled job state flag :"' + c + '"' | ||||
|         return jobState | ||||
|         return job_state | ||||
| 
 | ||||
|     def parseQueueMachineState(self, strQueueMachineStatus): | ||||
|         queueMachineState = 0 | ||||
|  | @ -134,8 +134,8 @@ class QstatParser: | |||
|                 if not bInPendingJobsSection: | ||||
|                     assert currentQueueMachine | ||||
|                 # log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"') | ||||
|                 iJobId = int(matchObj.group('jobId')) | ||||
|                 logging.debug('iJobId = %d' % iJobId) | ||||
|                 job_id = int(matchObj.group('jobId')) | ||||
|                 logging.debug('iJobId = %d' % job_id) | ||||
|                 jobState = self.parseJobState(matchObj.group('jobStatus')) | ||||
|                 strJobArrayDetails = matchObj.group('jobArrayDetails') | ||||
|                 bIsJobArray = (len(strJobArrayDetails) != 0) | ||||
|  | @ -154,33 +154,33 @@ class QstatParser: | |||
|                 logging.debug('task_ids = %s' % task_ids) | ||||
|                 for task_id in task_ids: | ||||
|                     logging.debug('task_id = %s' % task_id) | ||||
|                     jobId = None | ||||
|                     task_uid = None | ||||
|                     if bIsJobArray: | ||||
|                         jobId = JobId(iJobId, task_id) | ||||
|                         task_uid = TaskUid(job_id, task_id) | ||||
|                     else: | ||||
|                         jobId = JobId(iJobId) | ||||
|                     job = jobsState.getJob(jobId) | ||||
|                         task_uid = TaskUid(job_id) | ||||
|                     task = jobsState.get_task(task_uid) | ||||
|                     # logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr())) | ||||
|                     if job is None: | ||||
|                     if task is None: | ||||
|                         # this job hasn't been encountered yet in the output of qstat ... | ||||
|                         # we could either be in the pending jobs section or in the running jobs section | ||||
|                         job = Job(jobId) | ||||
|                         jobsState.addJob(job) | ||||
|                         job.setState(jobState) | ||||
|                         task = Task(task_uid) | ||||
|                         jobsState.addTask(task) | ||||
|                         task.set_state(jobState) | ||||
|                         strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime') | ||||
|                         jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S') | ||||
|                         if bInPendingJobsSection: | ||||
|                             job.setSubmitTime(jobStartOrSubmitTime) | ||||
|                             task.get_submit_time(jobStartOrSubmitTime) | ||||
|                         else: | ||||
|                             job.setStartTime(jobStartOrSubmitTime) | ||||
|                         job.setOwner(matchObj.group('jobOwner')) | ||||
|                         job.setScriptName(matchObj.group('jobScriptName')) | ||||
|                             task.set_start_time(jobStartOrSubmitTime) | ||||
|                         task.set_owner(matchObj.group('jobOwner')) | ||||
|                         task.set_script_name(matchObj.group('jobScriptName')) | ||||
|                         if bInPendingJobsSection: | ||||
|                             job.setNumRequiredSlots(int(matchObj.group('numSlots'))) | ||||
|                             task.set_num_required_slots(int(matchObj.group('numSlots'))) | ||||
|                     else: | ||||
|                         assert not bInPendingJobsSection  # if we are in the pending jobs section, the job should be new | ||||
|                     if not bInPendingJobsSection: | ||||
|                         job.addSlots(currentQueueMachine.getName(), int(matchObj.group('numSlots'))) | ||||
|                         task.add_slots(currentQueueMachine.getName(), int(matchObj.group('numSlots'))) | ||||
|             else: | ||||
|                 # the current line does not describe a job | ||||
|                 if not bInPendingJobsSection: | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue