# cocluto/cocluto/ClusterController/ClusterStatus.py
# (repository viewer metadata: 210 lines, 8.9 KiB, Python)

import threading
from JobsStateUpdater import JobsStateUpdater
import Lib.Util
import Lib.SimpaDbUtil
from ClusterNode import ClusterNode
from Log import logInfo, logError
from PowerState import PowerState
import time
class ClusterStatus:
    """
    The current state (jobs, sensors) of the cluster

    Keeps one ClusterNode per machine that is under the cluster controller's control
    (m_clusterNodes, indexed by machine name) and the most recent jobs state received
    through onNewJobsState() (m_jobsState, None until the first one arrives).

    @param gridEngine the interface to the batch job tool (in our case it's sun grid engine)
    """

    # machines that are flagged with setShouldAlwaysBeOn() as soon as they get under control
    ALWAYS_ON_MACHINE_NAMES = ('simpatix10',)

    def __init__(self, gridEngine):
        self.m_gridEngine = gridEngine
        self.m_clusterNodes = {}  # machine name -> ClusterNode
        self.m_lock = threading.Lock()  # to prevent concurrent access to this instance
        self.m_jobsStateUpdater = JobsStateUpdater(self)
        self.m_jobsState = None
        # No machine is controlled at startup; machines are added afterwards with setControlOnMachine().
        # Historical note : simpatix11 to simpatix39 used to be listed here, except
        #  - simpatix31 and simpatix32, which don't seem to be able to go to sleep properly (bug 00000010)
        #  - simpatix18, which needed maintenance (restarting because it was very slow for an unknown reason)
        self.m_controlledMachineNames = []
        nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames()
        for nodeName in nodeNames:
            if nodeName in self.m_controlledMachineNames:
                logInfo('machine %s is under the cluster controller\'s control' % nodeName)
                self.m_clusterNodes[nodeName] = self._createClusterNode(nodeName)

    def _createClusterNode(self, machineName):
        """
        builds the ClusterNode for machineName and flags it as always on if it belongs to ALWAYS_ON_MACHINE_NAMES
        """
        clusterNode = ClusterNode(machineName, self, self.m_gridEngine)
        if machineName in self.ALWAYS_ON_MACHINE_NAMES:
            clusterNode.setShouldAlwaysBeOn()
        return clusterNode

    def setControlOnMachine(self, machineName, bControl):
        """
        puts a machine under the control of ClusterController or removes it from its control

        @param machineName the name of the machine
        @param bControl if True, the machine is added (its status updater is started); if False, the machine's status updater is stopped and joined, then the machine is removed. Nothing happens if the machine already is in the requested situation.
        """
        if bControl:
            # add machineName under control of ClusterController
            if any(clusterNode.getName() == machineName for clusterNode in self.m_clusterNodes.values()):
                return  # nothing to do : machineName is already under the control of ClusterController
            clusterNode = self._createClusterNode(machineName)
            self.m_clusterNodes[machineName] = clusterNode
            clusterNode.m_machineStatusUpdater.start()
        else:
            # remove machineName from control of ClusterController
            clusterNode = self.m_clusterNodes.get(machineName)
            if clusterNode:
                clusterNode.m_machineStatusUpdater.m_bStop = True
                clusterNode.m_machineStatusUpdater.join()
                self.m_clusterNodes.pop(machineName)

    def getGridEngine(self):
        """
        returns the gridEngine that was passed to the constructor
        """
        return self.m_gridEngine

    def getMachines(self):
        """
        returns the dictionary (machine name -> ClusterNode) of the controlled machines (the internal dictionary itself, not a copy)
        """
        return self.m_clusterNodes

    def startReadingThreads(self):
        """
        starts the status updater of each controlled machine, then the jobs state updater
        """
        for clusterNode in self.m_clusterNodes.values():
            clusterNode.m_machineStatusUpdater.start()
        self.m_jobsStateUpdater.start()

    def stopReadingThreads(self):
        """
        asks each machine status updater to stop and waits for it, then does the same for the jobs state updater
        """
        for clusterNode in self.m_clusterNodes.values():
            clusterNode.m_machineStatusUpdater.m_bStop = True
            clusterNode.m_machineStatusUpdater.join()
        self.m_jobsStateUpdater.m_bStop = True
        self.m_jobsStateUpdater.join()

    def onNewJobsState(self, newJobsState):
        """
        replaces the current jobs state with newJobsState (under m_lock)
        """
        # 'with' guarantees that the lock is released whatever happens
        with self.m_lock:
            self.m_jobsState = newJobsState

    def getJobsOnMachine(self, machineName):
        """
        returns what the current jobs state reports as the jobs on machineName
        (fails with AttributeError if no jobs state has been received yet)
        """
        return self.m_jobsState.getJobsOnMachine(machineName)

    def isReady(self):
        """
        returns True if every controlled machine is ready and a jobs state has been received; otherwise logs the reason and returns False
        """
        for clusterNode in self.m_clusterNodes.values():
            if not clusterNode.isReady():
                logInfo('ClusterStatus::isReady : not ready because of ' + clusterNode.getName())
                return False
        if self.m_jobsState is None:
            logInfo('ClusterStatus::isReady : not ready because waiting for jobs state')
            return False
        return True

    def getIdleMachines(self):
        """
        returns the dictionary (machine name -> ClusterNode) of the controlled machines that are in power state ON and have no job on them

        raises AssertionError if the cluster status is not ready or if the jobs state is older than one hour (bug 00000009)
        """
        # the original code asserted the bound method itself (always true) : the call was missing
        assert self.isReady()
        # safety net for bug 00000009 : refuse to take decisions on a stale jobs state
        # (presumably getTime() is the time.time() at which the jobs state was read ; to be confirmed in JobsState)
        fJOBS_STATE_MAX_ALLOWED_AGE = 3600
        fJobsStateAge = time.time() - self.m_jobsState.getTime()
        if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
            message = 'ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge)
            logError(message)
            # explicit raise instead of 'assert False' so that the check survives python -O
            raise AssertionError(message)
        idleMachines = {}
        for machineName, machine in self.m_clusterNodes.items():
            if machine.getPowerState() == PowerState.ON:
                jobsOnThisMachine = self.getJobsOnMachine(machineName)
                if len(jobsOnThisMachine) == 0:
                    idleMachines[machineName] = machine
        return idleMachines

    def getPendingJobs(self):
        """
        returns what the current jobs state reports as pending jobs
        """
        return self.m_jobsState.getPendingJobs()

    def getJobsState(self):
        """
        returns the latest jobs state received (None if none has been received yet)
        """
        return self.m_jobsState

    def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements):
        """
        returns False (and logs the reason) if jobRequirements restricts the allowed queues and the queue of queueMachine is not one of them; returns True otherwise
        """
        if jobRequirements.m_queues:
            queueMachineQueueName = queueMachine.getQueueName()
            if not any(queueName == queueMachineQueueName for queueName in jobRequirements.m_queues):
                logInfo('queueMachineFitsJobRequirements : queueMachine ' + queueMachine.getName() + ' rejected because it\'s not in the allowed queues')
                return False
        return True

    def getEnergyConsumption(self):
        """
        returns an estimate of the energy consumption since the start of the cluster controller (in joules)
        """
        # there are cases where the machine is not ready yet (for example, it's just been added to clustercontroller's control) : such machines are ignored
        return sum((machine.getEnergyConsumption() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)

    def getEnergySavings(self):
        """
        returns an estimate of the energy saving since the start of the cluster controller (in joules)
        """
        return sum((machine.getEnergySavings() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)

    def getCurrentPowerConsumption(self):
        """
        returns the sum of the current power consumption of the controlled machines that are ready
        """
        return sum((machine.getPowerConsumption() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)

    def getCurrentPowerSavings(self):
        """
        returns the sum, over the controlled machines that are ready, of the difference between the power consumption of the machine in power state ON and its current power consumption
        """
        return sum((machine.getPowerConsumptionForPowerState(PowerState.ON) - machine.getPowerConsumption() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)

    def getNumControlledSlots(self):
        """
        returns the total number of slots of the controlled machines, according to the current jobs state
        """
        # 'with' releases the lock even if the jobs state is missing or incomplete
        with self.m_lock:
            iNumControlledSlots = 0
            for machine in self.m_clusterNodes.values():
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumControlledSlots += queueMachine.getNumSlots()
        return iNumControlledSlots

    def getNumUsedSlots(self):
        """
        returns the number of slots of the controlled machines that are not free (number of slots minus number of free slots), according to the current jobs state
        """
        with self.m_lock:
            iNumUsedSlots = 0
            for machine in self.m_clusterNodes.values():
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
                assert iNumUsedSlotsOnThisMachine >= 0
                iNumUsedSlots += iNumUsedSlotsOnThisMachine
        return iNumUsedSlots

    def getNumWastedSlots(self):
        """
        returns the number of free slots on the controlled machines that are in power state ON
        """
        with self.m_lock:
            iNumWastedSlots = 0
            for machine in self.m_clusterNodes.values():
                if machine.getPowerState() == PowerState.ON:
                    queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                    iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
        return iNumWastedSlots

    def getNumSleepingSlots(self):
        """
        returns the number of free slots on the controlled machines that are in power state SLEEP
        """
        with self.m_lock:
            iNumSleepingSlots = 0
            for machine in self.m_clusterNodes.values():
                if machine.getPowerState() == PowerState.SLEEP:
                    queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                    iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
        return iNumSleepingSlots