cocluto/cocluto/ClusterController/Job.py

179 lines
5.6 KiB
Python

import enum
from typing import Optional, Dict, List
from datetime import datetime
class JobStateFlags:
    """
    bit flags describing the state of a job

    each flag occupies its own bit, so that several flags can be combined into a single integer
    """
    RUNNING = 1 << 0  # the job is running
    WAITING = 1 << 1  # the job is waiting
    # not sure what that exactly means but it reflects the q state of jobs
    # as seen in the pending jobs list from qstat -f -u \*
    QUEUED = 1 << 2
    TRANSFERING = 1 << 3
    DELETED = 1 << 4
    HOLD = 1 << 5
    ERROR = 1 << 6
    SUSPENDED = 1 << 7
class ParallelEnvironment(enum.Enum):
    """
    the kinds of parallel environment that can appear in the requirements of a job
    """
    MPI = 1
# a quantity of memory, expressed as a number of bytes
MemoryUnit = int

# the state of a job, expressed as a combination of JobStateFlags
JobState = int

# the job id in the sense of sge's job id : a simple unique number
JobId = int

# the id of a task within its array Job
TaskId = int

# the name of a queue, eg 'main.q'
QueueId = str

# the identifier of a queue machine, eg 'main.q@physix99.ipr.univ-rennes.fr'
QueueMachineId = str

# a resource request, eg 'mem_available=5G'
ResourceRequest = str
class JobRequirements:
    """
    the requirements attached to a job

    every requirement is None on a freshly created instance
    """
    num_slots: Optional[int]
    architecture: Optional[str]  # machine architecture
    parallel_environment: Optional[ParallelEnvironment]
    queues: Optional[List[QueueId]]  # the list of queues this job is allowed to run on
    resources: Optional[List[ResourceRequest]]

    def __init__(self):
        # scalar requirements
        self.num_slots = self.architecture = self.parallel_environment = None
        # list requirements (None rather than an empty list)
        self.queues = self.resources = None
class TaskUid:
    """
    the identifier of a task, in the form <job_id>.<element_id>
    We treat each element of a job array as a separate task
    A single integer is no longer enough to identify a job because all elements in a job array
    share the same sge job identifier. To uniquely define a job array element, we also use the task id.
    """
    job_id: JobId
    task_id: Optional[TaskId]  # the identifier of a task within its job; None if this identifier does not refer to a job array element
    MAX_NUM_JOBS_IN_ARRAY = 1000000

    def __init__(self, job_id: JobId, task_id: Optional[TaskId] = None):
        """
        job_id : the identifier of the job
        task_id : the identifier of the task within its job array, or None if this is not a job array element
        """
        if task_id is not None:
            assert task_id <= self.MAX_NUM_JOBS_IN_ARRAY
        self.job_id = job_id
        self.task_id = task_id

    def __hash__(self) -> int:
        """
        required to use a TaskUid as a dict hash key
        """
        # the job id is shifted by MAX_NUM_JOBS_IN_ARRAY to leave room for the task id
        _hash = self.job_id * self.MAX_NUM_JOBS_IN_ARRAY
        if self.task_id is not None:
            _hash += self.task_id
        return _hash

    def __eq__(self, other: object) -> bool:
        """
        required to use a TaskUid as a dict hash key

        two TaskUid are equal when both their job id and their task id are equal
        """
        if not isinstance(other, TaskUid):
            # let python fall back to its default handling instead of failing
            # with an AttributeError when comparing with an unrelated object
            return NotImplemented
        return self.job_id == other.job_id and self.task_id == other.task_id

    def __repr__(self) -> str:
        return 'TaskUid(job_id=%r, task_id=%r)' % (self.job_id, self.task_id)

    def is_job_array_element(self) -> bool:
        """
        returns True if this identifier refers to an element of a job array
        """
        return self.task_id is not None

    def get_job_id(self) -> JobId:
        return self.job_id

    def as_str(self) -> str:
        """
        returns this identifier as '<job_id>' or, for a job array element, as '<job_id>.<task_id>'
        """
        result = '%s' % self.job_id
        if self.is_job_array_element():
            result += '.%d' % self.task_id
        return result
class Task:
    """
    a task and the information known about it : its owner, its script, its submit and start times,
    its state, its requirements and the slots it uses
    """
    task_uid: TaskUid  # the unique identifier of this task, eg '12345.789'
    start_time: Optional[datetime]
    submit_time: Optional[datetime]
    owner: Optional[str]
    script_name: Optional[str]
    slots: Dict[QueueMachineId, int]  # the number of slots used by this task on each queue machine
    state_flags: JobState  # combination of JobStateFlags
    job_requirements: JobRequirements
    requested_ram_per_core: MemoryUnit  # in bytes

    def __init__(self, task_uid: TaskUid):
        self.task_uid = task_uid
        self.start_time = None
        self.submit_time = None
        self.owner = None
        self.script_name = None
        self.slots = {}
        self.state_flags = 0
        self.job_requirements = JobRequirements()
        self.requested_ram_per_core = 0

    def get_id(self) -> TaskUid:
        return self.task_uid

    def set_state(self, state: JobState):
        self.state_flags = state

    def set_owner(self, job_owner: str):
        # once set, the owner is not expected to change
        if self.owner:
            assert self.owner == job_owner
        self.owner = job_owner

    def get_owner(self) -> Optional[str]:
        """
        returns the owner of this task, or None if it has not been set
        """
        return self.owner

    def set_start_time(self, job_start_time: datetime):
        # once set, the start time is not expected to change
        if self.start_time:
            assert self.start_time == job_start_time
        self.start_time = job_start_time

    def set_submit_time(self, job_submit_time: datetime):
        # once set, the submit time is not expected to change
        if self.submit_time:
            assert self.submit_time == job_submit_time
        self.submit_time = job_submit_time

    def get_submit_time(self, job_submit_time: Optional[datetime] = None) -> Optional[datetime]:
        """
        returns the submit time of this task, or None if it has not been set

        job_submit_time : deprecated. Despite its name, this method used to be the setter of the
            submit time; the argument is still accepted so that existing callers keep working.
            New code should call set_submit_time instead.
        """
        if job_submit_time is not None:
            self.set_submit_time(job_submit_time)
        return self.submit_time

    def get_start_time(self) -> Optional[datetime]:
        """
        returns the start time of this task, or None if it has not been set
        """
        return self.start_time

    def set_script_name(self, job_script_name: str):
        # once set, the script name is not expected to change
        if self.script_name:
            assert self.script_name == job_script_name
        self.script_name = job_script_name

    def add_slots(self, queue_machine_id: QueueMachineId, num_slots: int):
        """
        registers that this task uses num_slots slots on the queue machine queue_machine_id

        a given queue machine is expected to be registered only once per task
        """
        existing_num_slots = self.slots.get(queue_machine_id)
        assert existing_num_slots is None
        if existing_num_slots is None:
            self.slots[queue_machine_id] = num_slots
        else:
            # should never happen : only reachable when assertions are disabled
            self.slots[queue_machine_id] = existing_num_slots + num_slots

    def get_slots(self) -> Dict[QueueMachineId, int]:
        return self.slots

    def set_num_required_slots(self, num_slots: int):
        self.job_requirements.num_slots = num_slots

    def is_pending(self) -> bool:
        """
        returns true if this task is waiting in the queue for whatever reason
        """
        # convert the masked flags to a real boolean instead of returning the raw integer
        return bool(self.state_flags & JobStateFlags.QUEUED)

    def get_requested_ram_per_core(self) -> MemoryUnit:
        """
        requested RAM per core in bytes
        """
        return self.requested_ram_per_core

    def set_requested_ram_per_core(self, requested_ram: MemoryUnit):
        """
        requested_ram : requested RAM per core in bytes
        """
        self.requested_ram_per_core = requested_ram