fixed styling errors and added more type hints to improve the maintainability of cocluto

- all styling errors are now fixed, but some warnings and informational messages remain
- most functions have been renamed to snake_case
- most functions now have type hints (see the sketch below)

work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3873]
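For illustration, the conversion described in the bullets above follows this shape. The excerpt is a trimmed-down sketch based on the ClusterNode class in the diff below (the real class carries more state); the ClusterNodeId alias matches the one this commit introduces:

# before this commit: camelCase methods, m_-prefixed members, no type annotations
#
#     class ClusterNode:
#         def __init__(self, machineName):
#             self.m_name = machineName
#
#         def getName(self):
#             return self.m_name
#
# after this commit: snake_case names, plain attribute names and explicit type hints

ClusterNodeId = str  # eg 'physix99'


class ClusterNode:
    name: ClusterNodeId

    def __init__(self, machine_name: ClusterNodeId):
        self.name = machine_name

    def get_name(self) -> ClusterNodeId:
        return self.name
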
Guillaume Raffy 2024-06-14 15:52:32 +02:00
parent 6bf69f909b
commit 9f4a80b11e
24 changed files with 1037 additions and 951 deletions


@@ -1,21 +1,26 @@
#!/usr/bin/env python #!/usr/bin/env python
from typing import Dict, Optional
import sys import sys
sys.path.insert(0, '..')
import os import os
import MySQLdb import MySQLdb
import threading import threading
from Lib.Util import *
from Lib.SimpaDbUtil import *
import time import time
from ClusterStatus import ClusterStatus from datetime import datetime
from SlotAllocator import DecoupledSlotAllocator if sys.version_info < (3, 0):
from Log import logDebug, logInfo from HTMLParser import HTMLParser
from ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier else:
from SunGridEngine import SunGridEngine from html.parser import HTMLParser
from Util import log, onException from ..Util import log
from WebServer import WebServerThread # from ..SimpaDbUtil import toto
from PowerState import PowerState from .ClusterNode import ClusterNodeId, ClusterNode
from HTMLParser import HTMLParser from .ClusterStatus import ClusterStatus
from .SlotAllocator import DecoupledSlotAllocator, SlotAllocator
from .Log import logDebug, log_info
from .ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier
from .SunGridEngine import SunGridEngine
from .Util import on_exception
from .WebServer import WebServerThread
from .PowerState import PowerState
VERSION = '1.18' VERSION = '1.18'
@@ -23,38 +28,38 @@ VERSION = '1.18'
class MyHTMLParser(HTMLParser): class MyHTMLParser(HTMLParser):
def __init__(self): def __init__(self):
HTMLParser.__init__(self) HTMLParser.__init__(self)
self.TokenList = [] self.token_list = []
def handle_data(self, data): def handle_data(self, data):
data = data.strip() data = data.strip()
if data and len(data) > 0: if data and len(data) > 0:
self.TokenList.append(data) self.token_list.append(data)
# print data # print data
def GetTokenList(self): def get_token_list(self):
return self.TokenList return self.token_list
class WakeUpCompleteNotifier(IWakeUpCompleteNotifier): class WakeUpCompleteNotifier(IWakeUpCompleteNotifier):
def __init__(self, machineName, clusterController): def __init__(self, machineName, clusterController):
self.m_machineName = machineName self.machine_name = machineName
self.m_clusterController = clusterController self.cluster_controller = clusterController
def onWakeUpComplete(self): def on_wake_up_complete(self):
logDebug('WakeUpCompleteNotifier::onWakeUpComplete : start') logDebug('WakeUpCompleteNotifier::on_wake_up_complete : start')
self.m_clusterController.onMachineWakeUpComplete(self.m_machineName) self.cluster_controller.onMachineWakeUpComplete(self.machine_name)
class SleepCompleteNotifier(ISleepCompleteNotifier): class SleepCompleteNotifier(ISleepCompleteNotifier):
def __init__(self, machineName, clusterController): def __init__(self, machineName, clusterController):
self.m_machineName = machineName self.machine_name = machineName
self.m_clusterController = clusterController self.cluster_controller = clusterController
def onSleepComplete(self, bSleepSucceeded): def on_sleep_complete(self, bSleepSucceeded):
logDebug('SleepCompleteNotifier::onSleepComplete : start') logDebug('SleepCompleteNotifier::on_sleep_complete : start')
self.m_clusterController.onMachineSleepComplete(self.m_machineName, bSleepSucceeded) self.cluster_controller.onMachineSleepComplete(self.machine_name, bSleepSucceeded)
def jouleToKwh(fEnergyInJoules): def jouleToKwh(fEnergyInJoules):
@@ -79,23 +84,36 @@ class ClusterController:
jobs (eg add some machines to a queue). jobs (eg add some machines to a queue).
Mechanism to let user get priority Mechanism to let user get priority
""" """
cluster_status: ClusterStatus
slot_allocator = SlotAllocator
machines_that_need_wake_up: Dict[ClusterNodeId, ClusterNode]
machines_that_need_wake_up_lock: threading.Lock # to prevent concurrent access to machines_that_need_wake_up
machines_that_need_sleeping: Dict[ClusterNodeId, ClusterNode]
machines_that_need_sleeping_lock: threading.Lock # to prevent concurrent access to machines_that_need_sleeping
last_energy_status_log_time: Optional[datetime]
DELAY_BETWEEN_ENERGY_STATUS_LOGS: int # in seconds
session_id: Optional[int] # session (run) identifier in database
web_server: WebServerThread
stop: bool
stop_lock: threading.Lock # to prevent concurrent access to stop
def __init__(self): def __init__(self):
gridEngine = SunGridEngine() gridEngine = SunGridEngine()
self.m_clusterStatus = ClusterStatus(gridEngine) self.cluster_status = ClusterStatus(gridEngine)
self.m_slotAllocator = DecoupledSlotAllocator() # SimpleSlotAllocator() self.slot_allocator = DecoupledSlotAllocator() # SimpleSlotAllocator() pylint: disable=no-value-for-parameter
self.m_machinesThatNeedWakeUp = {} self.machines_that_need_wake_up = {}
self.m_machinesThatNeedWakeupLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedWakeUp self.machines_that_need_wake_up_lock = threading.Lock()
self.m_machinesThatNeedSleeping = {} self.machines_that_need_sleeping = {}
self.m_machinesThatNeedSleepingLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedSleeping self.machines_that_need_sleeping_lock = threading.Lock()
self.m_lastEnergyStatusLogTime = None self.last_energy_status_log_time = None
self.DELAY_BETWEEN_ENERGY_STATUS_LOGS = 60 # in seconds self.DELAY_BETWEEN_ENERGY_STATUS_LOGS = 60
self.m_iSessionId = None # session (run) identifier in database self.session_id = None
self.m_webServer = WebServerThread(self) self.web_server = WebServerThread(self)
self.m_bStop = False self.stop = False
self.m_bStopLock = threading.Lock() # to prevent concurrent access to m_bStop self.stop_lock = threading.Lock()
def getClusterStatus(self): def getClusterStatus(self):
return self.m_clusterStatus return self.cluster_status
def log(self, message): def log(self, message):
print(message) print(message)
@@ -104,79 +122,76 @@ class ClusterController:
self.log("ClusterController::shutdownLeastImportantNode : start") self.log("ClusterController::shutdownLeastImportantNode : start")
def onMachineWakeUpComplete(self, machineName): def onMachineWakeUpComplete(self, machineName):
self.m_machinesThatNeedWakeupLock.acquire() self.machines_that_need_wake_up_lock.acquire()
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp))) # logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
del self.m_machinesThatNeedWakeUp[machineName] del self.machines_that_need_wake_up[machineName]
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp))) # logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
self.m_machinesThatNeedWakeupLock.release() self.machines_that_need_wake_up_lock.release()
logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName) logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)
def onMachineSleepComplete(self, machineName, bSleepSucceeded): def onMachineSleepComplete(self, machineName, bSleepSucceeded):
self.m_machinesThatNeedSleepingLock.acquire() self.machines_that_need_sleeping_lock.acquire()
# logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp))) # logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
del self.m_machinesThatNeedSleeping[machineName] del self.machines_that_need_sleeping[machineName]
# logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp))) # logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
self.m_machinesThatNeedSleepingLock.release() self.machines_that_need_sleeping_lock.release()
if bSleepSucceeded: if bSleepSucceeded:
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName) logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
else: else:
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName) logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)
def getNumPendingWakeUps(self): def getNumPendingWakeUps(self):
self.m_machinesThatNeedWakeupLock.acquire() self.machines_that_need_wake_up_lock.acquire()
numPendingWakeUps = len(self.m_machinesThatNeedWakeUp) numPendingWakeUps = len(self.machines_that_need_wake_up)
self.m_machinesThatNeedWakeupLock.release() self.machines_that_need_wake_up_lock.release()
return numPendingWakeUps return numPendingWakeUps
def getNumPendingSleeps(self): def getNumPendingSleeps(self):
self.m_machinesThatNeedSleepingLock.acquire() self.machines_that_need_sleeping_lock.acquire()
numPendingSleeps = len(self.m_machinesThatNeedSleeping) numPendingSleeps = len(self.machines_that_need_sleeping)
self.m_machinesThatNeedSleepingLock.release() self.machines_that_need_sleeping_lock.release()
return numPendingSleeps return numPendingSleeps
def putIdleMachinesToSleep(self): def putIdleMachinesToSleep(self):
self.m_clusterStatus.m_lock.acquire() self.cluster_status.lock.acquire()
idleMachines = self.m_clusterStatus.getIdleMachines() idleMachines = self.cluster_status.get_idle_machines()
# logInfo('idleMachines :') # log_info('idleMachines :')
self.m_machinesThatNeedToSleep = [] for _machineName, idleMachine in idleMachines.items():
for machineName, idleMachine in idleMachines.items(): if idleMachine.get_power_state() == PowerState.ON:
if idleMachine.getPowerState() == PowerState.ON: # log_info('\t%s' % machineName)
# logInfo('\t%s' % machineName) if idleMachine.get_name() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things
if idleMachine.getName() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things self.machines_that_need_sleeping[idleMachine.get_name()] = idleMachine
self.m_machinesThatNeedSleeping[idleMachine.getName()] = idleMachine self.cluster_status.lock.release()
self.m_clusterStatus.m_lock.release()
listOfMachinesThatNeedSleeping = self.m_machinesThatNeedSleeping.values() # duplicate the list so that we don't iterate on m_machinesThatNeedSleeping, which could cause a runtime error because callbacks alter m_machinesThatNeedWakeUp listOfMachinesThatNeedSleeping = self.machines_that_need_sleeping.values() # duplicate the list so that we don't iterate on machines_that_need_sleeping, which could cause a runtime error because callbacks alter machines_that_need_wake_up
for machine in listOfMachinesThatNeedSleeping: for machine in listOfMachinesThatNeedSleeping:
logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName()) log_info('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.get_name())
machine.requestSleep(SleepCompleteNotifier(machine.getName(), self)) machine.request_sleep(SleepCompleteNotifier(machine.get_name(), self)) # pylint: disable=no-value-for-parameter
if len(listOfMachinesThatNeedSleeping) != 0: if len(listOfMachinesThatNeedSleeping) != 0:
# hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times # hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
while self.getNumPendingSleeps() > 0: while self.getNumPendingSleeps() > 0:
time.sleep(1) time.sleep(1)
def wakeUpMachinesForPendingJobs(self): def wake_up_machinesForPendingJobs(self):
listOfMachinesThatNeedWakeUp = [] listOfMachinesThatNeedWakeUp = []
self.m_clusterStatus.m_lock.acquire() self.cluster_status.lock.acquire()
pendingJobs = self.m_clusterStatus.getPendingJobs() pendingJobs = self.cluster_status.get_pending_jobs()
""" # log_info('pending jobs :')
logInfo('pending jobs :') # for job in pendingJobs.values():
for job in pendingJobs.values(): # log_info('\t%d' % job.getId().asStr())
logInfo('\t%d' % job.getId().asStr())
"""
if len(pendingJobs) != 0: if len(pendingJobs) != 0:
self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp(pendingJobs, self.m_clusterStatus) self.machines_that_need_wake_up = self.slot_allocator.get_machinesThatNeedWakeUp(pendingJobs, self.cluster_status)
if len(self.m_machinesThatNeedWakeUp) == 0: if len(self.machines_that_need_wake_up) == 0:
None pass
# logInfo('ClusterController::updateNormalState : no machine needs waking up') # log_info('ClusterController::updateNormalState : no machine needs waking up')
else: else:
listOfMachinesThatNeedWakeUp = self.m_machinesThatNeedWakeUp.values() # duplicate the list so that we don't iterate on m_machinesThatNeedWakeUp, which would cause a runtime error because callbacks alter m_machinesThatNeedWakeUp listOfMachinesThatNeedWakeUp = self.machines_that_need_wake_up.values() # duplicate the list so that we don't iterate on machines_that_need_wake_up, which would cause a runtime error because callbacks alter machines_that_need_wake_up
for machine in listOfMachinesThatNeedWakeUp: for machine in listOfMachinesThatNeedWakeUp:
logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for ' + machine.getName()) log_info('ClusterController::wake_up_machinesForPendingJobs : requesting wake up for ' + machine.get_name())
machine.requestWakeUp(WakeUpCompleteNotifier(machine.getName(), self)) machine.request_wake_up(WakeUpCompleteNotifier(machine.get_name(), self)) # pylint: disable=no-value-for-parameter
self.m_clusterStatus.m_lock.release() self.cluster_status.lock.release()
if len(listOfMachinesThatNeedWakeUp) != 0: if len(listOfMachinesThatNeedWakeUp) != 0:
# hack : wait until the wakeup requests are handled so that a later sleep request doesn't cancel it # hack : wait until the wakeup requests are handled so that a later sleep request doesn't cancel it
@@ -184,16 +199,16 @@ class ClusterController:
while self.getNumPendingWakeUps() > 0: while self.getNumPendingWakeUps() > 0:
time.sleep(1) time.sleep(1)
iSGE_CHEK_RUNNABLE_JOBS_DELAY = 60 * 5 # max time it takes for sge between the fact that a queued job is runnable and SGE actually starting it (I've put a long time here because sometimes, qstat takes a long time to ralise that the machine is available after I wake it up) iSGE_CHEK_RUNNABLE_JOBS_DELAY = 60 * 5 # max time it takes for sge between the fact that a queued job is runnable and SGE actually starting it (I've put a long time here because sometimes, qstat takes a long time to ralise that the machine is available after I wake it up)
logInfo('ClusterController::wakeUpMachinesForPendingJobs : all required machines are awake. Now give %d seconds to SGE to allocate slots.' % iSGE_CHEK_RUNNABLE_JOBS_DELAY) log_info('ClusterController::wake_up_machinesForPendingJobs : all required machines are awake. Now give %d seconds to SGE to allocate slots.' % iSGE_CHEK_RUNNABLE_JOBS_DELAY)
# wait until SGE has a chance to allocate slots # wait until SGE has a chance to allocate slots
time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time.... time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time....
logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots') log_info('ClusterController::wake_up_machinesForPendingJobs : end of the delay given to SGE to allocate slots')
def updateNormalState(self): def updateNormalState(self):
# attempt to shut down machines that are idle # attempt to shut down machines that are idle
self.putIdleMachinesToSleep() self.putIdleMachinesToSleep()
# wake up necessary machines if there are pending jobs # wake up necessary machines if there are pending jobs
self.wakeUpMachinesForPendingJobs() self.wake_up_machinesForPendingJobs()
def storeSessionInDatabase(self): def storeSessionInDatabase(self):
conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller') conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
@@ -207,7 +222,7 @@ class ClusterController:
iSessionId = r.fetch_row()[0][0] iSessionId = r.fetch_row()[0][0]
# stores information about the session # stores information about the session
sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.m_clusterStatus.m_clusterNodes)) sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.cluster_status.cluster_nodes))
print(sqlCommand) print(sqlCommand)
conn.query(sqlCommand) conn.query(sqlCommand)
@@ -225,64 +240,62 @@ class ClusterController:
assert conn assert conn
# update energy savings for the current session # update energy savings for the current session
sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.m_clusterStatus.getEnergySavings()), self.m_iSessionId) sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.cluster_status.get_energy_savings()), self.session_id)
print(sqlCommand) print(sqlCommand)
conn.query(sqlCommand) conn.query(sqlCommand)
# update the end time of the current session # update the end time of the current session
sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.m_iSessionId) sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.session_id)
print(sqlCommand) print(sqlCommand)
conn.query(sqlCommand) conn.query(sqlCommand)
conn.close() conn.close()
def setControlOnMachine(self, machineName, bControl): def set_control_on_machine(self, machineName, bControl):
""" """
adds or removes the control of ClusterController on the given machine adds or removes the control of ClusterController on the given machine
""" """
self.m_clusterStatus.setControlOnMachine(machineName, bControl) self.cluster_status.set_control_on_machine(machineName, bControl)
def run(self): def run(self):
""" """
""" """
self.m_iSessionId = self.storeSessionInDatabase() self.session_id = self.storeSessionInDatabase()
log("storeSessionInDatabase completed") log("storeSessionInDatabase completed")
DELAY_BETWEEN_MEASURES = 10 # in seconds DELAY_BETWEEN_MEASURES = 10 # in seconds
self.m_clusterStatus.startReadingThreads() self.cluster_status.start_reading_threads()
self.m_webServer.start() self.web_server.start()
while not self.m_clusterStatus.isReady(): while not self.cluster_status.is_ready():
log('waiting for system to be ready') log('waiting for system to be ready')
time.sleep(1) time.sleep(1)
None log_info('ClusterController::run : cluster initial readings have completed')
logInfo('ClusterController::run : cluster initial readings have completed')
startTime = time.localtime() startTime = time.localtime()
while not self.m_bStop: while not self.stop:
currentTime = time.time() currentTime = time.time()
# clusterStatus.m_nodesStatus['simpatix10'].dump() if (not self.last_energy_status_log_time) or (currentTime > (self.last_energy_status_log_time + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)): iNumMachines = len(self.cluster_status.cluster_nodes)
iNumMachines = len(self.m_clusterStatus.m_clusterNodes)
iNumMachinesOn = 0 iNumMachinesOn = 0
iNumSleepingMachines = 0 iNumSleepingMachines = 0
for machine in self.m_clusterStatus.m_clusterNodes.values(): for machine in self.cluster_status.cluster_nodes.values():
ePowerState = machine.getPowerState() ePowerState = machine.get_power_state()
if ePowerState == PowerState.ON: if ePowerState == PowerState.ON:
iNumMachinesOn += 1 iNumMachinesOn += 1
elif ePowerState == PowerState.SLEEP: elif ePowerState == PowerState.SLEEP:
iNumSleepingMachines += 1 iNumSleepingMachines += 1
logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines)) log_info('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))
iNumSlots = self.m_clusterStatus.getNumControlledSlots() iNumSlots = self.cluster_status.get_num_controlled_slots()
iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots() iNumUsedSlots = self.cluster_status.get_num_used_slots()
iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots() iNumWastedSlots = self.cluster_status.get_num_wasted_slots()
iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots() iNumSleepingSlots = self.cluster_status.get_num_sleeping_slots()
logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots)) log_info('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots))
logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings())) log_info('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.cluster_status.get_current_power_consumption(), self.cluster_status.get_current_power_savings()))
logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings()))) log_info('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.cluster_status.get_energy_consumption()), jouleToKwh(self.cluster_status.get_energy_savings())))
self.updateSessionEnergyConsumptionInDatabase() self.updateSessionEnergyConsumptionInDatabase()
self.m_lastEnergyStatusLogTime = currentTime self.last_energy_status_log_time = currentTime
self.updateNormalState() self.updateNormalState()
time.sleep(DELAY_BETWEEN_MEASURES) time.sleep(DELAY_BETWEEN_MEASURES)
self.m_clusterStatus.stopReadingThreads() self.cluster_status.stop_reading_threads()
def storeClusterNodeStatus(clusterNodeStatus): def storeClusterNodeStatus(clusterNodeStatus):
@@ -290,19 +303,17 @@ def storeClusterNodeStatus(clusterNodeStatus):
conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements') conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
assert conn assert conn
# conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""") # conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""")
''' # conn.query("""SELECT * FROM fan_rpm_logs""")
conn.query("""SELECT * FROM fan_rpm_logs""") # r=conn.store_result()
r=conn.store_result() # print r.fetch_row()[0]
print r.fetch_row()[0] for _key, sensor in clusterNodeStatus.sensors.items():
''' sensorId = clusterNodeStatus.cluster_node_name + '_' + sensor.name
for key, sensor in clusterNodeStatus.m_sensors.items():
sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name
if sensor.typeName() == 'Fan': if sensor.typeName() == 'Fan':
sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_rpms) + """, NOW());""" sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.rpms) + """, NOW());"""
print(sqlCommand) print(sqlCommand)
conn.query(sqlCommand) conn.query(sqlCommand)
elif sensor.typeName() == 'Temperature': elif sensor.typeName() == 'Temperature':
sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_temperature) + """, NOW());""" sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.temperature) + """, NOW());"""
print(sqlCommand) print(sqlCommand)
conn.query(sqlCommand) conn.query(sqlCommand)
else: else:
@@ -311,11 +322,11 @@ def storeClusterNodeStatus(clusterNodeStatus):
if __name__ == '__main__': if __name__ == '__main__':
# Lib.Util.sendTextMail('SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content') # Lib.Util.send_text_mail('SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content')
try: try:
logInfo('ClusterController v. %s starting....' % VERSION) log_info('ClusterController v. %s starting....' % VERSION)
# executeCommand('ping -o -t 1 simpatix310 > /dev/null') # execute_command('ping -o -t 1 simpatix310 > /dev/null')
# print executeCommand('ssh simpatix10 "ipmitool sensor"') # print execute_command('ssh simpatix10 "ipmitool sensor"')
# assert False, 'prout' # assert False, 'prout'
controller = ClusterController() controller = ClusterController()
controller.run() controller.run()
@@ -323,4 +334,4 @@ if __name__ == '__main__':
# except AssertionError, error: # except AssertionError, error:
# except KeyboardInterrupt, error: # except KeyboardInterrupt, error:
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt) except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
onException(exception) on_exception(exception)


@@ -1,142 +1,161 @@
import threading from typing import TYPE_CHECKING
from PowerState import PowerState, PowerStateToStr from typing import Optional
from ClusterNodeStatusUpdater import ClusterNodeStatusUpdater from .PowerState import PowerState, PowerStateToStr
import Lib.Util from .Log import log_info, log_warning
import Lib.SimpaDbUtil from .ClusterNodeStatusUpdater import ClusterNodeStatusUpdater
from Log import logInfo, logWarning if TYPE_CHECKING:
from .ClusterStatus import ClusterStatus
from .SunGridEngine import SunGridEngine
from .ClusterController import SleepCompleteNotifier
from datetime import datetime from datetime import datetime
ClusterNodeId = str # eg 'physix99'
class ClusterNode: class ClusterNode:
""" """
the state of a machine node the state of a machine node
""" """
def __init__(self, machineName, cluster, gridEngine): name: ClusterNodeId
self.m_name = machineName cluster: 'ClusterStatus' # the cluster this machine belongs to
self.m_cluster = cluster # the cluster this machine belongs to requested_power_state: PowerState
self.m_requestedPowerState = PowerState.ON power_state: PowerState
self.m_powerState = PowerState.UNKNOWN last_power_state_time: Optional[datetime] # time at which the last value of self.power_state has been set
self.m_lastPowerStateTime = None # time at which the last value of self.m_powerState has been set machine_status_updater: ClusterNodeStatusUpdater
self.m_machineStatusUpdater = ClusterNodeStatusUpdater(machineName, self, gridEngine) energy_consumption: float # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
self.m_energyConsumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules) energy_savings: float # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
self.m_energySavings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
def getName(self): def __init__(self, machine_name: ClusterNodeId, cluster: 'ClusterStatus', grid_engine: 'SunGridEngine'):
return self.m_name self.name = machine_name
self.cluster = cluster # the cluster this machine belongs to
self.requested_power_state = PowerState.ON
self.power_state = PowerState.UNKNOWN
self.last_power_state_time = None # time at which the last value of self.power_state has been set
self.machine_status_updater = ClusterNodeStatusUpdater(machine_name, self, grid_engine)
self.energy_consumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
self.energy_savings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
def isReady(self): def get_name(self) -> ClusterNodeId:
if self.m_powerState == PowerState.UNKNOWN: return self.name
# logInfo(self.m_name + ' is not ready (waiting for power state)')
def is_ready(self) -> bool:
if self.power_state == PowerState.UNKNOWN:
# log_info(self.name + ' is not ready (waiting for power state)')
return False return False
if self.m_powerState == PowerState.ON: if self.power_state == PowerState.ON:
return True return True
# log(self.m_name + ' is ready') # log(self.name + ' is ready')
return True return True
def getPowerState(self): def get_power_state(self) -> PowerState:
return self.m_powerState return self.power_state
def setShouldAlwaysBeOn(self): def set_should_always_be_on(self):
self.m_machineStatusUpdater.setShouldAlwaysBeOn() self.machine_status_updater.set_should_always_be_on()
self.setPowerState(PowerState.ON) self.set_power_state(PowerState.ON)
def setPowerState(self, powerState): def set_power_state(self, power_state: PowerState):
bUpdateRequiredChecks = False bUpdateRequiredChecks = False
if self.m_powerState == PowerState.UNKNOWN: if self.power_state == PowerState.UNKNOWN:
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been initialized to ' + PowerStateToStr(powerState)) log_info('ClusterNode::set_power_state : ' + self.name + '\'s power state has been initialized to ' + PowerStateToStr(power_state))
self.m_powerState = powerState self.power_state = power_state
self.m_lastPowerStateTime = datetime.now() self.last_power_state_time = datetime.now()
bUpdateRequiredChecks = True bUpdateRequiredChecks = True
else: else:
# update the estimation of energy consumption # update the estimation of energy consumption
self.updateEnergyMeasurements() self.update_energy_measurements()
# then change the power state # then change the power state
if self.m_powerState != powerState: if self.power_state != power_state:
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been changed to ' + PowerStateToStr(powerState)) log_info('ClusterNode::set_power_state : ' + self.name + '\'s power state has been changed to ' + PowerStateToStr(power_state))
self.m_powerState = powerState self.power_state = power_state
self.m_lastPowerStateTime = datetime.now() self.last_power_state_time = datetime.now()
bUpdateRequiredChecks = True bUpdateRequiredChecks = True
if bUpdateRequiredChecks: if bUpdateRequiredChecks:
if self.m_powerState == PowerState.ON: if self.power_state == PowerState.ON:
self.m_machineStatusUpdater.m_bCheckPowerState = True self.machine_status_updater.check_power_state = True
self.m_machineStatusUpdater.m_bCheckSensors = True self.machine_status_updater.check_sensors = True
elif self.m_powerState == PowerState.OFF: elif self.power_state == PowerState.OFF:
self.m_machineStatusUpdater.m_bCheckPowerState = True self.machine_status_updater.check_power_state = True
self.m_machineStatusUpdater.m_bCheckSensors = False self.machine_status_updater.check_sensors = False
elif self.m_powerState == PowerState.SLEEP: elif self.power_state == PowerState.SLEEP:
self.m_machineStatusUpdater.m_bCheckPowerState = True self.machine_status_updater.check_power_state = True
self.m_machineStatusUpdater.m_bCheckSensors = False self.machine_status_updater.check_sensors = False
elif self.m_powerState == PowerState.UNPLUGGED: elif self.power_state == PowerState.UNPLUGGED:
self.m_machineStatusUpdater.m_bCheckPowerState = True self.machine_status_updater.check_power_state = True
self.m_machineStatusUpdater.m_bCheckSensors = False self.machine_status_updater.check_sensors = False
else: else:
assert False assert False
def onNewPowerStateReading(self, powerState): def on_new_power_state_reading(self, power_state: PowerState):
""" """
called when a new powerstate reading arrives called when a new powerstate reading arrives
""" """
if powerState != self.getPowerState(): if power_state != self.get_power_state():
if self.getPowerState() != PowerState.UNKNOWN: if self.get_power_state() != PowerState.UNKNOWN:
logWarning('ClusterNode::onNewPowerStateReading : ' + self.m_name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(powerState)) log_warning('ClusterNode::on_new_power_state_reading : ' + self.name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(power_state))
self.setPowerState(powerState) self.set_power_state(power_state)
def getPowerConsumptionForPowerState(self, ePowerState): def get_power_consumption_for_power_state(self, power_state: PowerState) -> float:
""" """
returns the power consumption estimation (in watts) of this machine for the given power state returns the power consumption estimation (in watts) of this machine for the given power state
""" """
fCurrentIntensity = 0.0 fCurrentIntensity = 0.0
fCurrentVoltage = 220.0 fCurrentVoltage = 220.0
# noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine # noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine
if ePowerState == PowerState.ON: if power_state == PowerState.ON:
fCurrentIntensity = 0.9 # value when the machine is doing nothing fCurrentIntensity = 0.9 # value when the machine is doing nothing
elif ePowerState == PowerState.OFF: elif power_state == PowerState.OFF:
fCurrentIntensity = 0.1 fCurrentIntensity = 0.1
elif ePowerState == PowerState.SLEEP: elif power_state == PowerState.SLEEP:
fCurrentIntensity = 0.1 fCurrentIntensity = 0.1
elif ePowerState == PowerState.UNPLUGGED: elif power_state == PowerState.UNPLUGGED:
fCurrentIntensity = 0.0 fCurrentIntensity = 0.0
else: else:
assert False assert False
return fCurrentIntensity * fCurrentVoltage return fCurrentIntensity * fCurrentVoltage
def updateEnergyMeasurements(self): def update_energy_measurements(self):
timeInterval = datetime.now() - self.m_lastPowerStateTime timeInterval = datetime.now() - self.last_power_state_time
self.m_energyConsumption += self.getPowerConsumptionForPowerState(self.m_powerState) * timeInterval.seconds self.energy_consumption += self.get_power_consumption_for_power_state(self.power_state) * timeInterval.seconds
self.m_energySavings += (self.getPowerConsumptionForPowerState(PowerState.ON) - self.getPowerConsumptionForPowerState(self.m_powerState)) * timeInterval.seconds self.energy_savings += (self.get_power_consumption_for_power_state(PowerState.ON) - self.get_power_consumption_for_power_state(self.power_state)) * timeInterval.seconds
self.m_lastPowerStateTime = datetime.now() self.last_power_state_time = datetime.now()
# logDebug('energy savings on %s : %f J' %(self.getName(), self.m_energySavings)) # logDebug('energy savings on %s : %f J' %(self.get_name(), self.energy_savings))
def getEnergyConsumption(self): def get_energy_consumption(self) -> float:
""" """
in joules in joules
""" """
self.updateEnergyMeasurements() self.update_energy_measurements()
return self.m_energyConsumption return self.energy_consumption
def getPowerConsumption(self): def get_power_consumption(self) -> float:
fCurrentPowerConsumption = self.getPowerConsumptionForPowerState(self.m_powerState) fCurrentPowerConsumption = self.get_power_consumption_for_power_state(self.power_state)
# logDebug('getPowerConsumption of %s : %f (powerstate = %d)' % (self.getName(), fCurrentPowerConsumption, self.m_powerState)) # logDebug('get_power_consumption of %s : %f (powerstate = %d)' % (self.get_name(), fCurrentPowerConsumption, self.power_state))
return fCurrentPowerConsumption return fCurrentPowerConsumption
def getEnergySavings(self): def get_energy_savings(self) -> float:
self.updateEnergyMeasurements() self.update_energy_measurements()
return self.m_energySavings return self.energy_savings
def onSleepFailedBecauseAJobJustArrived(self): def on_sleep_because_a_job_just_arrived(self):
logInfo('%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name) log_info('%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.name)
def requestSleep(self, sleepCompleteNotifier=None): def request_sleep(self, sleep_complete_notifier: Optional['SleepCompleteNotifier'] = None):
self.m_machineStatusUpdater.requestSleep(sleepCompleteNotifier) self.machine_status_updater.request_sleep(sleep_complete_notifier)
def requestWakeUp(self, wakeUpCompleteNotifier=None): def request_wake_up(self, wake_up_complete_notifier: Optional['SleepCompleteNotifier'] = None):
self.m_machineStatusUpdater.requestWakeUp(wakeUpCompleteNotifier) self.machine_status_updater.request_wake_up(wake_up_complete_notifier)
def getQueueMachineName(self): def get_queue_machine_name(self) -> ClusterNodeId:
return self.getCluster().getJobsState().getQueueMachine(self.m_name).getName() return self.get_cluster().get_jobs_state().get_queue_machine(self.name).get_name()
assert self.m_queueName is not None # assert self.queue_name is not None
return self.m_queueName # return self.queue_name
def getCluster(self): def get_cluster(self) -> 'ClusterStatus':
return self.m_cluster return self.cluster
# from .ClusterStatus import ClusterStatus # noqa: E402, pylint: disable=wrong-import-position
# from .SunGridEngine import SunGridEngine # noqa: E402, pylint: disable=wrong-import-position
# from .ClusterController import SleepCompleteNotifier # noqa: E402, pylint: disable=wrong-import-position


@@ -1,39 +1,46 @@
from typing import TYPE_CHECKING
from typing import Optional, List
import threading import threading
import time import time
import Lib.Util import abc
import Lib.SimpaDbUtil from .PowerState import PowerState
from PowerState import PowerState from .Log import log_info, logDebug
from Log import logInfo, logDebug from .Util import blocking_wake_up_machine, blocking_put_machine_to_sleep, get_power_state, on_exception
from Util import blockingWakeUpMachine, blockingPutMachineToSleep, getPowerState, onException if TYPE_CHECKING:
from .ClusterNode import ClusterNodeId, ClusterNode
from .SunGridEngine import SunGridEngine
class IWakeUpCompleteNotifier: class IWakeUpCompleteNotifier(abc.ABCMeta):
""" """
interface for wakeup notifiers interface for wakeup notifiers
""" """
def onWakeUpComplete(self): @abc.abstractmethod
def on_wake_up_complete(self):
assert False assert False
class ISleepCompleteNotifier: class ISleepCompleteNotifier(abc.ABCMeta):
""" """
interface for sleep notifiers interface for sleep notifiers
""" """
def onSleepComplete(self, bSleepSucceeded): @abc.abstractmethod
def on_sleep_complete(self, bSleepSucceeded):
assert False assert False
class IRequest: class IRequest(abc.ABCMeta):
GO_TO_SLEEP = 1 GO_TO_SLEEP = 1
WAKE_UP = 2 WAKE_UP = 2
CHECK_POWER_STATE = 3 CHECK_POWER_STATE = 3
def __init__(self, requestType): def __init__(self, requestType):
self.m_type = requestType self.type = requestType
def getType(self): def getType(self):
return self.m_type return self.type
@abc.abstractmethod
def process(self, clusterNodeStatusUpdater): def process(self, clusterNodeStatusUpdater):
""" """
processes this request processes this request
@@ -43,58 +50,58 @@ class IRequest:
class WakeUpRequest(IRequest): class WakeUpRequest(IRequest):
def __init__(self, wakeUpNotifier): def __init__(self, wakeUpNotifier: IWakeUpCompleteNotifier):
IRequest.__init__(self, IRequest.WAKE_UP) IRequest.__init__(self, IRequest.WAKE_UP)
self.m_wakeUpNotifier = wakeUpNotifier self.wake_up_notifier = wakeUpNotifier
def process(self, clusterNodeStatusUpdater): def process(self, clusterNodeStatusUpdater):
assert clusterNodeStatusUpdater.m_bShouldAlwaysBeOn is False # are we attempting to wake up a machine that should always be on ? assert clusterNodeStatusUpdater.should_always_be_on is False # are we attempting to wake up a machine that should always be on ?
logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName()) log_info('Handling wakeup request for %s' % clusterNodeStatusUpdater.get_name())
bSuccess = blockingWakeUpMachine(clusterNodeStatusUpdater.getName()) bSuccess = blocking_wake_up_machine(clusterNodeStatusUpdater.get_name())
assert bSuccess assert bSuccess
# activate the associated machine queue # activate the associated machine queue
if clusterNodeStatusUpdater.setQueueActivation(True): if clusterNodeStatusUpdater.set_queue_activation(True):
pass # all is ok pass # all is ok
else: else:
assert False assert False
clusterNodeStatusUpdater.m_stateLock.acquire() clusterNodeStatusUpdater.state_lock.acquire()
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON) clusterNodeStatusUpdater.cluster_node.set_power_state(PowerState.ON)
clusterNodeStatusUpdater.m_stateLock.release() clusterNodeStatusUpdater.state_lock.release()
if self.m_wakeUpNotifier: if self.wake_up_notifier:
logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification') logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
self.m_wakeUpNotifier.onWakeUpComplete() self.wake_up_notifier.on_wake_up_complete()
class SleepRequest(IRequest): class SleepRequest(IRequest):
def __init__(self, sleepCompleteNotifier): def __init__(self, sleepCompleteNotifier: ISleepCompleteNotifier):
IRequest.__init__(self, IRequest.GO_TO_SLEEP) IRequest.__init__(self, IRequest.GO_TO_SLEEP)
self.m_sleepCompleteNotifier = sleepCompleteNotifier self.sleep_complete_notifier = sleepCompleteNotifier
def process(self, clusterNodeStatusUpdater): def process(self, clusterNodeStatusUpdater):
assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn # are we attempting to put a machine the should stay on to sleep ? assert not clusterNodeStatusUpdater.should_always_be_on # are we attempting to put a machine the should stay on to sleep ?
logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName()) log_info('Handling sleep request for %s' % clusterNodeStatusUpdater.get_name())
if clusterNodeStatusUpdater.setQueueActivation(False): if clusterNodeStatusUpdater.set_queue_activation(False):
if clusterNodeStatusUpdater.queueIsEmpty(): if clusterNodeStatusUpdater.queue_is_empty():
if blockingPutMachineToSleep(clusterNodeStatusUpdater.m_clusterNodeName): if blocking_put_machine_to_sleep(clusterNodeStatusUpdater.cluster_node_name):
# now we know that the machine is asleep # now we know that the machine is asleep
clusterNodeStatusUpdater.m_stateLock.acquire() clusterNodeStatusUpdater.state_lock.acquire()
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.SLEEP) clusterNodeStatusUpdater.cluster_node.set_power_state(PowerState.SLEEP)
clusterNodeStatusUpdater.m_stateLock.release() clusterNodeStatusUpdater.state_lock.release()
if self.m_sleepCompleteNotifier: if self.sleep_complete_notifier:
self.m_sleepCompleteNotifier.onSleepComplete(True) self.sleep_complete_notifier.on_sleep_complete(True)
else: else:
assert False assert False
else: else:
# reactivate the queue # reactivate the queue
if not clusterNodeStatusUpdater.setQueueActivation(True): if not clusterNodeStatusUpdater.set_queue_activation(True):
assert False assert False
clusterNodeStatusUpdater.m_stateLock.acquire() clusterNodeStatusUpdater.state_lock.acquire()
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON) # this is necessary to reenable the various cyclic checks that were disabled on sleep request clusterNodeStatusUpdater.cluster_node.set_power_state(PowerState.ON) # this is necessary to reenable the various cyclic checks that were disabled on sleep request
clusterNodeStatusUpdater.m_stateLock.release() clusterNodeStatusUpdater.state_lock.release()
clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived() clusterNodeStatusUpdater.cluster_node.on_sleep_because_a_job_just_arrived()
if self.m_sleepCompleteNotifier: if self.sleep_complete_notifier:
self.m_sleepCompleteNotifier.onSleepComplete(False) self.sleep_complete_notifier.on_sleep_complete(False)
else: else:
assert False assert False
@@ -105,88 +112,99 @@ class CheckPowerStateRequest(IRequest):
IRequest.__init__(self, IRequest.CHECK_POWER_STATE) IRequest.__init__(self, IRequest.CHECK_POWER_STATE)
def process(self, clusterNodeStatusUpdater): def process(self, clusterNodeStatusUpdater):
powerState = getPowerState(clusterNodeStatusUpdater.m_clusterNodeName) powerState = get_power_state(clusterNodeStatusUpdater.cluster_node_name)
clusterNodeStatusUpdater.m_stateLock.acquire() clusterNodeStatusUpdater.state_lock.acquire()
clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading(powerState) clusterNodeStatusUpdater.cluster_node.on_new_power_state_reading(powerState)
clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time() clusterNodeStatusUpdater.last_power_check_state_time = time.time()
clusterNodeStatusUpdater.m_stateLock.release() clusterNodeStatusUpdater.state_lock.release()
class ClusterNodeStatusUpdater(threading.Thread): class ClusterNodeStatusUpdater(threading.Thread):
cluster_node_name: 'ClusterNodeId'
cluster_node: 'ClusterNode'
grid_engine: 'SunGridEngine'
stop: bool
last_power_check_state_time: Optional[time.time]
check_power_state: bool
check_sensors: Optional[bool]
state_lock: threading.Lock # lock that prevents concurrent access to the state of this instance
should_always_be_on: bool # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
pending_requests_queue: List[IRequest]
DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60 # in seconds DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60 # in seconds
def __init__(self, machineName, clusterNode, gridEngine): def __init__(self, machineName: 'ClusterNodeId', clusterNode: 'ClusterNode', gridEngine: 'SunGridEngine'):
threading.Thread.__init__(self) threading.Thread.__init__(self)
self.m_clusterNodeName = machineName self.cluster_node_name = machineName
self.m_clusterNode = clusterNode self.cluster_node = clusterNode
self.m_gridEngine = gridEngine self.grid_engine = gridEngine
self.m_bStop = False self.stop = False
self.m_lastPowerStateCheckTime = None # time.time() self.last_power_check_state_time = None
self.m_bCheckPowerState = True self.check_power_state = True
self.m_stateLock = threading.Lock() # lock that prevents concurrent access to the state of this instance self.state_lock = threading.Lock()
self.m_bShouldAlwaysBeOn = False # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10) self.should_always_be_on = False
self.m_pendingRequestsQueue = [] self.pending_requests_queue = []
self.check_sensors = None
def getGridEngine(self): def get_grid_engine(self):
return self.m_gridEngine return self.grid_engine
def getName(self): def get_name(self):
return self.m_clusterNodeName return self.cluster_node_name
def setShouldAlwaysBeOn(self): def set_should_always_be_on(self):
print('%s should always be on' % (self.getName())) print('%s should always be on' % (self.get_name()))
self.m_bShouldAlwaysBeOn = True self.should_always_be_on = True
def pushRequest(self, request): def push_request(self, request: IRequest):
self.m_stateLock.acquire() self.state_lock.acquire()
self.m_pendingRequestsQueue.append(request) self.pending_requests_queue.append(request)
self.m_stateLock.release() self.state_lock.release()
def popRequest(self): def pop_request(self) -> IRequest:
oldestRequest = None oldest_request = None
self.m_stateLock.acquire() self.state_lock.acquire()
if len(self.m_pendingRequestsQueue) != 0: if len(self.pending_requests_queue) != 0:
oldestRequest = self.m_pendingRequestsQueue.pop(0) oldest_request = self.pending_requests_queue.pop(0)
self.m_stateLock.release() self.state_lock.release()
return oldestRequest return oldest_request
def run(self): def run(self):
try: try:
while not self.m_bStop: while not self.stop:
# handle the oldest request # handle the oldest request
request = self.popRequest() request = self.pop_request()
if request is not None: if request is not None:
request.process(self) request.process(self)
# schedule a power state check if required # schedule a power state check if required
currentTime = time.time() currentTime = time.time()
if self.m_bCheckPowerState: if self.check_power_state:
if not self.m_bShouldAlwaysBeOn: # don't do power checks on such machines because some current implementations of if not self.should_always_be_on: # don't do power checks on such machines because some current implementations of
# operations involved might cause the machine to go to sleep # operations involved might cause the machine to go to sleep
if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)): if (not self.last_power_check_state_time) or (currentTime > (self.last_power_check_state_time + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
self.pushRequest(CheckPowerStateRequest()) self.push_request(CheckPowerStateRequest()) # pylint: disable=no-value-for-parameter
time.sleep(1) time.sleep(1)
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt) except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
onException(exception) on_exception(exception)
def requestSleep(self, sleepCompleteNotifier=None): def request_sleep(self, sleep_complete_notifier: Optional[ISleepCompleteNotifier] = None):
assert not self.m_bShouldAlwaysBeOn assert not self.should_always_be_on
self.pushRequest(SleepRequest(sleepCompleteNotifier)) self.push_request(SleepRequest(sleep_complete_notifier)) # pylint: disable=no-value-for-parameter
def requestWakeUp(self, wakeUpNotifier=None): def request_wake_up(self, wake_up_complete_notifier: Optional[IWakeUpCompleteNotifier] = None):
assert self.m_bShouldAlwaysBeOn is False assert self.should_always_be_on is False
self.pushRequest(WakeUpRequest(wakeUpNotifier)) self.push_request(WakeUpRequest(wake_up_complete_notifier)) # pylint: disable=no-value-for-parameter
def getQueueMachineName(self): def get_queue_machine_name(self):
return self.m_clusterNode.getQueueMachineName() return self.cluster_node.get_queue_machine_name()
def setQueueActivation(self, bEnable): def set_queue_activation(self, bEnable: bool):
""" """
@return true on success, false otherwise @return true on success, false otherwise
""" """
return self.getGridEngine().setQueueInstanceActivation(self.getQueueMachineName(), bEnable) return self.get_grid_engine().set_queue_instance_activation(self.get_queue_machine_name(), bEnable)
def queueIsEmpty(self): def queue_is_empty(self):
return self.getGridEngine().queueIsEmpty(self.getName()) return self.get_grid_engine().queue_is_empty(self.get_name())


@@ -1,11 +1,14 @@
import threading
from JobsStateUpdater import JobsStateUpdater
import Lib.Util
import Lib.SimpaDbUtil
from ClusterNode import ClusterNode
from Log import logInfo, logError
from PowerState import PowerState
import time import time
from typing import Dict, Optional, List
import threading
from .Job import TaskUid, Task, QueueMachineId, JobRequirements
from .JobsStateUpdater import JobsStateUpdater
from .ClusterNode import ClusterNode, ClusterNodeId
from .Log import log_info, logError
from .PowerState import PowerState
from ..SimpaDbUtil import get_cluster_machines_names
from .SunGridEngine import SunGridEngine
from .JobsState import JobsState
class ClusterStatus: class ClusterStatus:
@@ -14,196 +17,203 @@ class ClusterStatus:
@param gridEngine the interface to the batch job tool (in our case it's sun grid engine) @param gridEngine the interface to the batch job tool (in our case it's sun grid engine)
""" """
def __init__(self, gridEngine): grid_engine: SunGridEngine
self.m_gridEngine = gridEngine cluster_nodes: Dict[ClusterNodeId, ClusterNode]
self.m_clusterNodes = {} lock: threading.Lock # to prevent concurrent access to this instance
self.m_lock = threading.Lock() # to prevent concurrent access to this instance jobs_state_updater: JobsStateUpdater
self.m_jobsStateUpdater = JobsStateUpdater(self) jobs_state: Optional[JobsState]
self.m_jobsState = None controlled_machine_names: List[ClusterNodeId]
# self.m_controlledMachineNames = ['simpatix30']
self.m_controlledMachineNames = [] # ['simpatix30'] def __init__(self, grid_engine: SunGridEngine):
self.grid_engine = grid_engine
self.cluster_nodes = {}
self.lock = threading.Lock()
self.jobs_state_updater = JobsStateUpdater(self)
self.jobs_state = None
# self.controlled_machine_names = ['simpatix30']
self.controlled_machine_names = [] # ['simpatix30']
if False: if False:
for iMachine in range(11, 40): for iMachine in range(11, 40):
if (iMachine == 31) or (iMachine == 32): if (iMachine == 31) or (iMachine == 32):
continue # these machines don't seem to be able to go to sleep properly (bug 00000010) continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
if (iMachine == 18): if (iMachine == 18):
continue # this machine needs maintenance (restarting because it's very slow for an unknown reason) continue # this machine needs maintenance (restarting because it's very slow for an unknown reason)
self.m_controlledMachineNames.append('simpatix%d' % iMachine) self.controlled_machine_names.append('simpatix%d' % iMachine)
nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames() node_names = get_cluster_machines_names()
for nodeName in nodeNames: for node_name in node_names:
if nodeName in self.m_controlledMachineNames: if node_name in self.controlled_machine_names:
logInfo('machine %s is under the cluster controller\'s control' % nodeName) log_info('machine %s is under the cluster controller\'s control' % node_name)
clusterNode = ClusterNode(nodeName, self, gridEngine) cluster_node = ClusterNode(node_name, self, grid_engine)
if nodeName == 'simpatix10': if node_name == 'simpatix10':
clusterNode.setShouldAlwaysBeOn() cluster_node.set_should_always_be_on()
self.m_clusterNodes[nodeName] = clusterNode self.cluster_nodes[node_name] = cluster_node
return return
def setControlOnMachine(self, machineName, bControl): def set_control_on_machine(self, machine_name: ClusterNodeId, control: bool):
if bControl: if control:
# add machineName under control of ClusterController # add machineName under control of ClusterController
for k, v in self.m_clusterNodes.items(): for _k, v in self.cluster_nodes.items():
if v.getName() == machineName: if v.get_name() == machine_name:
return # nothing to do : machineName is already under the control of ClusterController return # nothing to do : machineName is already under the control of ClusterController
clusterNode = ClusterNode(machineName, self, self.m_gridEngine) cluster_node = ClusterNode(machine_name, self, self.grid_engine)
if machineName == 'simpatix10': if machine_name == 'simpatix10':
clusterNode.setShouldAlwaysBeOn() cluster_node.set_should_always_be_on()
self.m_clusterNodes[machineName] = clusterNode self.cluster_nodes[machine_name] = cluster_node
clusterNode.m_machineStatusUpdater.start() cluster_node.machine_status_updater.start()
else: else:
# remove machineName from control of ClusterController # remove machineName from control of ClusterController
clusterNode = self.m_clusterNodes.get(machineName) cluster_node = self.cluster_nodes.get(machine_name)
if clusterNode: if cluster_node:
clusterNode.m_machineStatusUpdater.m_bStop = True cluster_node.machine_status_updater.stop = True
clusterNode.m_machineStatusUpdater.join() cluster_node.machine_status_updater.join()
self.m_clusterNodes.pop(machineName) self.cluster_nodes.pop(machine_name)
def getGridEngine(self): def get_grid_engine(self) -> SunGridEngine:
return self.m_gridEngine return self.grid_engine
def getMachines(self): def get_machines(self) -> Dict[ClusterNodeId, ClusterNode]:
return self.m_clusterNodes return self.cluster_nodes
def startReadingThreads(self): def start_reading_threads(self):
for k, v in self.m_clusterNodes.items(): for _k, v in self.cluster_nodes.items():
v.m_machineStatusUpdater.start() v.machine_status_updater.start()
self.m_jobsStateUpdater.start() self.jobs_state_updater.start()
def stopReadingThreads(self): def stop_reading_threads(self):
for k, v in self.m_clusterNodes.items(): for _k, v in self.cluster_nodes.items():
v.m_machineStatusUpdater.m_bStop = True v.machine_status_updater.stop = True
v.m_machineStatusUpdater.join() v.machine_status_updater.join()
self.m_jobsStateUpdater.m_bStop = True self.jobs_state_updater.stop = True
self.m_jobsStateUpdater.join() self.jobs_state_updater.join()
def onNewJobsState(self, newJobsState): def on_new_jobs_state(self, new_jobs_state: JobsState):
# logDebug('ClusterStatus::onNewJobsState : attempting to acquire lock to access m_jobsState') # logDebug('ClusterStatus::on_new_jobs_state : attempting to acquire lock to access jobs_state')
self.m_lock.acquire() self.lock.acquire()
# logDebug('ClusterStatus::onNewJobsState : got lock to access m_jobsState') # logDebug('ClusterStatus::on_new_jobs_state : got lock to access jobs_state')
self.m_jobsState = newJobsState self.jobs_state = new_jobs_state
self.m_lock.release() self.lock.release()
def getJobsOnMachine(self, machineName): def get_jobs_on_machine(self, machine_name: ClusterNodeId) -> Dict[TaskUid, Task]:
return self.m_jobsState.getJobsOnMachine(machineName) return self.jobs_state.get_jobs_on_machine(machine_name)
def isReady(self): def is_ready(self) -> bool:
for k, v in self.m_clusterNodes.items(): for _k, v in self.cluster_nodes.items():
if not v.isReady(): if not v.is_ready():
logInfo('ClusterStatus::isReady : not ready because of ' + v.getName()) log_info('ClusterStatus::is_ready : not ready because of ' + v.get_name())
return False return False
# log('ClusterStatus::isReady() : '+k+' is ready') # log('ClusterStatus::is_ready() : '+k+' is ready')
# assert(False) # assert(False)
if self.m_jobsState is None: if self.jobs_state is None:
logInfo('ClusterStatus::isReady : not ready because waiting for jobs state') log_info('ClusterStatus::is_ready : not ready because waiting for jobs state')
return False return False
return True return True
def getIdleMachines(self): def get_idle_machines(self) -> Dict[ClusterNodeId, ClusterNode]:
assert self.isReady assert self.is_ready()
bBUG_00000009_IS_STILL_ALIVE = True bBUG_00000009_IS_STILL_ALIVE = True
if bBUG_00000009_IS_STILL_ALIVE: if bBUG_00000009_IS_STILL_ALIVE:
currentTime = time.time() currentTime = time.time()
fJOBS_STATE_MAX_ALLOWED_AGE = 3600 fJOBS_STATE_MAX_ALLOWED_AGE = 3600
fJobsStateAge = currentTime - self.m_jobsState.getTime() fJobsStateAge = currentTime - self.jobs_state.get_time()
if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE: if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
logError('ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge)) logError('ClusterStatus::get_idle_machines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge))
assert False assert False
idleMachines = {} idleMachines = {}
for machineName, machine in self.m_clusterNodes.items(): for machineName, machine in self.cluster_nodes.items():
if machine.getPowerState() == PowerState.ON: if machine.get_power_state() == PowerState.ON:
jobsOnThisMachine = self.getJobsOnMachine(machineName) jobsOnThisMachine = self.get_jobs_on_machine(machineName)
if len(jobsOnThisMachine) == 0: if len(jobsOnThisMachine) == 0:
idleMachines[machineName] = machine idleMachines[machineName] = machine
return idleMachines return idleMachines
def getPendingJobs(self): def get_pending_jobs(self) -> Dict[TaskUid, Task]:
return self.m_jobsState.getPendingJobs() return self.jobs_state.get_pending_jobs()
def getJobsState(self): def get_jobs_state(self) -> JobsState:
return self.m_jobsState return self.jobs_state
def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements): def queue_machine_fits_job_requirements(self, queue_machine: QueueMachine, job_requirements: JobRequirements) -> bool:
if jobRequirements.m_queues: if job_requirements.queues:
bQueueIsInAllowedQueues = False bQueueIsInAllowedQueues = False
for queueName in jobRequirements.m_queues: for queueName in job_requirements.queues:
if queueName == queueMachine.getQueueName(): if queueName == queue_machine.get_queue_name():
bQueueIsInAllowedQueues = True bQueueIsInAllowedQueues = True
if not bQueueIsInAllowedQueues: if not bQueueIsInAllowedQueues:
logInfo('queueMachineFitsJobRequirements : queueMachine ' + queueMachine.getName() + ' rejected because it\'s not in the allowed queues') log_info('queue_machine_fits_job_requirements : queue_machine ' + queue_machine.get_name() + ' rejected because it\'s not in the allowed queues')
return False return False
return True return True
def getEnergyConsumption(self): def get_energy_consumption(self) -> float:
""" """
returns an estimate of the energy consumption since the start of the cluster controller (in joules) returns an estimate of the energy consumption since the start of the cluster controller (in joules)
""" """
fEnergyConsumption = 0.0 fEnergyConsumption = 0.0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
if machine.isReady(): # there are cases where the machine is not ready yet (for example, it's just been added to clustercontroller's control) if machine.is_ready(): # there are cases where the machine is not ready yet (for example, it's just been added to clustercontroller's control)
fEnergyConsumption += machine.getEnergyConsumption() fEnergyConsumption += machine.get_energy_consumption()
return fEnergyConsumption return fEnergyConsumption
def getEnergySavings(self): def get_energy_savings(self) -> float:
""" """
returns an estimate of the energy saving since the start of the cluster controller (in joules) returns an estimate of the energy saving since the start of the cluster controller (in joules)
""" """
fEnergySavings = 0.0 fEnergySavings = 0.0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
if machine.isReady(): if machine.is_ready():
fEnergySavings += machine.getEnergySavings() fEnergySavings += machine.get_energy_savings()
return fEnergySavings return fEnergySavings
def getCurrentPowerConsumption(self): def get_current_power_consumption(self) -> float:
fPowerConsumption = 0.0 power_consumption = 0.0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
if machine.isReady(): if machine.is_ready():
fPowerConsumption += machine.getPowerConsumption() power_consumption += machine.get_power_consumption()
return fPowerConsumption return power_consumption
def getCurrentPowerSavings(self): def get_current_power_savings(self) -> float:
fPowerSavings = 0.0 power_savings = 0.0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
if machine.isReady(): if machine.is_ready():
fPowerSavings += machine.getPowerConsumptionForPowerState(PowerState.ON) - machine.getPowerConsumption() power_savings += machine.get_power_consumption_for_power_state(PowerState.ON) - machine.get_power_consumption()
return fPowerSavings return power_savings
def getNumControlledSlots(self): def get_num_controlled_slots(self) -> int:
self.m_lock.acquire() self.lock.acquire()
iNumControlledSlots = 0 num_controlled_slots = 0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
queueMachine = self.m_jobsState.getQueueMachine(machine.getName()) queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
iNumControlledSlots += queueMachine.getNumSlots() num_controlled_slots += queue_machine.get_num_slots()
self.m_lock.release() self.lock.release()
return iNumControlledSlots return num_controlled_slots
def getNumUsedSlots(self): def get_num_used_slots(self) -> int:
self.m_lock.acquire() self.lock.acquire()
iNumUsedSlots = 0 num_used_slots = 0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
queueMachine = self.m_jobsState.getQueueMachine(machine.getName()) queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine) num_used_slots_on_this_machine = queue_machine.get_num_slots() - self.jobs_state.get_num_free_slots_on_queue_machine(queue_machine)
assert iNumUsedSlotsOnThisMachine >= 0 assert num_used_slots_on_this_machine >= 0
iNumUsedSlots += iNumUsedSlotsOnThisMachine num_used_slots += num_used_slots_on_this_machine
self.m_lock.release() self.lock.release()
return iNumUsedSlots return num_used_slots
def getNumWastedSlots(self): def get_num_wasted_slots(self) -> int:
self.m_lock.acquire() self.lock.acquire()
iNumWastedSlots = 0 iNumWastedSlots = 0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
if machine.getPowerState() == PowerState.ON: if machine.get_power_state() == PowerState.ON:
queueMachine = self.m_jobsState.getQueueMachine(machine.getName()) queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine) iNumWastedSlots += self.jobs_state.get_num_free_slots_on_queue_machine(queue_machine)
self.m_lock.release() self.lock.release()
return iNumWastedSlots return iNumWastedSlots
def getNumSleepingSlots(self): def get_num_sleeping_slots(self) -> int:
self.m_lock.acquire() self.lock.acquire()
iNumSleepingSlots = 0 iNumSleepingSlots = 0
for machine in self.m_clusterNodes.values(): for machine in self.cluster_nodes.values():
if machine.getPowerState() == PowerState.SLEEP: if machine.get_power_state() == PowerState.SLEEP:
queueMachine = self.m_jobsState.getQueueMachine(machine.getName()) queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine) iNumSleepingSlots += self.jobs_state.get_num_free_slots_on_queue_machine(queue_machine)
self.m_lock.release() self.lock.release()
return iNumSleepingSlots return iNumSleepingSlots
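For orientation, a hypothetical sketch of how a monitoring loop could poll the lock-protected accessors defined above; it assumes a ClusterStatus instance has already been built and its reading threads started, and the reporting function name is illustrative, not part of this commit.

    import time

    def report_cluster_usage(cluster_status, period=60.0):
        # wait until every machine status updater and the jobs state updater have delivered data
        while not cluster_status.is_ready():
            time.sleep(1.0)
        while True:
            used = cluster_status.get_num_used_slots()             # acquires and releases the internal lock
            total = cluster_status.get_num_controlled_slots()
            watts = cluster_status.get_current_power_consumption()
            print('%d/%d slots in use, about %.1f W drawn' % (used, total, watts))
            time.sleep(period)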


@ -26,7 +26,7 @@ if __name__ == '__main__':
remoteCommand += 'launchctl unload /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;' remoteCommand += 'launchctl unload /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
remoteCommand += 'launchctl load /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;' remoteCommand += 'launchctl load /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
command = 'ssh root@'+ machineName +' "'+remoteCommand+'"' command = 'ssh root@'+ machineName +' "'+remoteCommand+'"'
( returnCode, stdout, stderr ) = executeCommand( command ) ( returnCode, stdout, stderr ) = execute_command( command )
for strSingleCommand in remoteCommand.split(';'): for strSingleCommand in remoteCommand.split(';'):
print(strSingleCommand) print(strSingleCommand)
print(stdout) print(stdout)
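As a side note, the (returnCode, stdout, stderr) triple returned by execute_command above lends itself to a small wrapper; the helper below is an illustrative sketch, not part of this commit, and assumes execute_command is in scope, imported from the project's Util module as in the script above.

    def run_or_raise(command):
        # relies on the execute_command helper used above, which returns (return_code, stdout, stderr)
        return_code, stdout, stderr = execute_command(command)
        if return_code != 0:
            raise RuntimeError('command %r failed with code %d: %s' % (command, return_code, stderr))
        return stdout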


@ -1,3 +1,4 @@
import enum
from typing import Optional, Dict, List from typing import Optional, Dict, List
from datetime import datetime from datetime import datetime
@ -13,7 +14,7 @@ class JobStateFlags:
SUSPENDED = 128 SUSPENDED = 128
class ParallelEnvironment: class ParallelEnvironment(enum.Enum):
MPI = 1 MPI = 1
@ -29,14 +30,14 @@ ResourceRequest = str # eg 'mem_available=5G'
class JobRequirements: class JobRequirements:
num_slots: Optional[int] num_slots: Optional[int]
architecture: Optional[str] # machine architecture architecture: Optional[str] # machine architecture
m_parallelEnvironment: Optional[int] # todo: make ParallelEnvironment an Enum parallel_environment: Optional[ParallelEnvironment]
queues: Optional[List[QueueId]] # the list of queues this job is allowed to run on queues: Optional[List[QueueId]] # the list of queues this job is allowed to run on
resources: Optional[List[ResourceRequest]] resources: Optional[List[ResourceRequest]]
def __init__(self): def __init__(self):
self.num_slots = None self.num_slots = None
self.architecture = None self.architecture = None
self.m_parallelEnvironment = None self.parallel_environment = None
self.queues = None self.queues = None
self.resources = None self.resources = None
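To make the renamed fields concrete, here is a hypothetical way a JobRequirements could be filled in before being checked by ClusterStatus.queue_machine_fits_job_requirements; only attribute names visible in this diff are used, and the import follows the relative-import style used elsewhere in this changeset.

    from .Job import JobRequirements, ParallelEnvironment  # assumed to live in the same package

    requirements = JobRequirements()
    requirements.num_slots = 8
    requirements.parallel_environment = ParallelEnvironment.MPI
    requirements.queues = ['main.q']    # restrict the job to queue instances of main.q
    # a controller could then ask, for a given queue_machine:
    #     cluster_status.queue_machine_fits_job_requirements(queue_machine, requirements)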
@ -62,10 +63,10 @@ class TaskUid:
""" """
required to use a TaskUid as a dict hash key required to use a TaskUid as a dict hash key
""" """
hash = self.job_id * self.MAX_NUM_JOBS_IN_ARRAY _hash = self.job_id * self.MAX_NUM_JOBS_IN_ARRAY
if self.task_id is not None: if self.task_id is not None:
hash += self.task_id _hash += self.task_id
return hash return _hash
def __eq__(self, other: 'TaskUid'): def __eq__(self, other: 'TaskUid'):
""" """


@ -1,26 +1,30 @@
from typing import Dict from typing import Dict
from .Log import * from datetime import datetime
from .Job import Task, TaskUid # from .Log import log_info
from .Job import Task, TaskUid, QueueMachineId
from .QueueMachine import QueueMachine
class JobsState: class JobsState:
""" """
represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*" represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \\*"
""" """
tasks: Dict[TaskUid, Task] tasks: Dict[TaskUid, Task] # list of tasks
job_array_tasks: Dict[int, Dict[TaskUid, Task]] job_array_tasks: Dict[int, Dict[TaskUid, Task]] # a dictionary of jobs for each job array, indexed by job array id
queue_machines: Dict[QueueMachineId, QueueMachine] # list of queue machines such as allintel.q@simpatix10
state_time: datetime # the time at which the state was snapshot
def __init__(self): def __init__(self):
self.tasks = {} # list of jobs self.tasks = {}
self.job_array_tasks = {} # a dictionary of jobs for each job array, indexed by job array id self.job_array_tasks = {}
self.m_queueMachines = {} # list of queue machines such as allintel.q@simpatix10 self.queue_machines = {}
self.m_stateTime = None # the time at which the state was snapshot self.state_time = None
def deleteAllJobs(self): def delete_all_tasks(self):
self.tasks = {} self.tasks = {}
self.job_array_tasks = {} self.job_array_tasks = {}
def addTask(self, task: Task): def add_task(self, task: Task):
task_uid = task.get_id() task_uid = task.get_id()
self.tasks[task_uid] = task self.tasks[task_uid] = task
if task_uid.is_job_array_element(): if task_uid.is_job_array_element():
@ -36,56 +40,56 @@ class JobsState:
def get_job_array_tasks(self, job_array_id: int) -> Dict[TaskUid, Task]: def get_job_array_tasks(self, job_array_id: int) -> Dict[TaskUid, Task]:
return self.job_array_tasks.get(job_array_id) return self.job_array_tasks.get(job_array_id)
def setTime(self, stateTime): def set_time(self, state_time: datetime):
self.m_stateTime = stateTime self.state_time = state_time
def getTime(self): def get_time(self) -> datetime:
return self.m_stateTime return self.state_time
def getJobsOnMachine(self, machineName): def get_jobs_on_machine(self, machine_name: str) -> Dict[TaskUid, Task]:
jobs_on_machine = {} jobs_on_machine = {}
for task_uid, task in self.tasks.items(): for task_uid, task in self.tasks.items():
for queueMachineName, numSlots in task.get_slots().items(): for queue_machine_name, _num_slots in task.get_slots().items():
jobMachineName = queueMachineName.split('@')[1] jobMachineName = queue_machine_name.split('@')[1]
if jobMachineName == machineName: if jobMachineName == machine_name:
jobs_on_machine[task_uid] = task jobs_on_machine[task_uid] = task
return jobs_on_machine return jobs_on_machine
def getNumFreeSlotsOnQueueMachine(self, queueMachine): def get_num_free_slots_on_queue_machine(self, queue_machine: QueueMachine) -> int:
# logInfo('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.getName()) # log_info('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.get_name())
numUsedSlots = 0 numUsedSlots = 0
for job in self.tasks.values(): for job in self.tasks.values():
numUsedSlotsByThisJob = job.get_slots().get(queueMachine.getName()) numUsedSlotsByThisJob = job.get_slots().get(queue_machine.get_name())
if numUsedSlotsByThisJob is not None: if numUsedSlotsByThisJob is not None:
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob)) # log_info('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob))
numUsedSlots += numUsedSlotsByThisJob numUsedSlots += numUsedSlotsByThisJob
else: else:
None pass
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr()) # log_info('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr())
numFreeSlots = queueMachine.getNumSlots() - numUsedSlots numFreeSlots = queue_machine.get_num_slots() - numUsedSlots
assert numFreeSlots >= 0 assert numFreeSlots >= 0
return numFreeSlots return numFreeSlots
def addQueueMachine(self, queueMachine): def add_queue_machine(self, queue_machine: QueueMachine):
self.m_queueMachines[queueMachine.getName()] = queueMachine self.queue_machines[queue_machine.get_name()] = queue_machine
def getQueueMachine(self, machineName): def get_queue_machine(self, machine_name) -> QueueMachine:
""" """
finds the queue machine associated with a machine finds the queue machine associated with a machine
""" """
queueMachine = None queueMachine = None
for qmName, qm in self.m_queueMachines.items(): for _qname, qm in self.queue_machines.items():
if qm.m_machineName == machineName: if qm.machine_name == machine_name:
assert queueMachine is None # to be sure that no more than one queue machine is on a given machine assert queueMachine is None # to be sure that no more than one queue machine is on a given machine
queueMachine = qm queueMachine = qm
return queueMachine return queueMachine
def getQueueMachines(self): def get_queue_machines(self) -> Dict[QueueMachineId, QueueMachine]:
return self.m_queueMachines return self.queue_machines
def getPendingJobs(self): def get_pending_jobs(self) -> Dict[TaskUid, Task]:
pendingJobs = {} pending_jobs = {}
for jobId, job in self.tasks.items(): for _task_id, task in self.tasks.items():
if job.is_pending(): if task.is_pending():
pendingJobs[job.get_id()] = job pending_jobs[task.get_id()] = task
return pendingJobs return pending_jobs
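A short, hypothetical sketch of how a freshly built JobsState might be queried through the accessors above; qstat_text stands for captured qstat output that would normally come from SunGridEngine, and the machine name is only an example.

    import time
    from .QstatParser import QstatParser   # relative import, as in this changeset

    jobs_state = QstatParser().parse_qstat_output(qstat_text)   # qstat_text: output of 'qstat -f -u \*'
    jobs_state.set_time(time.time())
    for task_uid in jobs_state.get_pending_jobs():
        print('pending task:', task_uid)
    for task_uid in jobs_state.get_jobs_on_machine('simpatix10'):
        print('running on simpatix10:', task_uid)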


@ -1,35 +1,39 @@
from typing import TYPE_CHECKING
import threading import threading
import Util
import os
import traceback
import sys
import time import time
from .Util import on_exception
if TYPE_CHECKING:
from .ClusterStatus import ClusterStatus
class JobsStateUpdater( threading.Thread ):
DELAY_BETWEEN_STATUS_CHECKS=10 # in seconds class JobsStateUpdater(threading.Thread):
def __init__( self, clusterStatus ): cluster_status: 'ClusterStatus'
stop: bool
DELAY_BETWEEN_STATUS_CHECKS = 10 # in seconds
def __init__(self, clusterStatus):
threading.Thread.__init__(self) threading.Thread.__init__(self)
self.m_clusterStatus = clusterStatus self.cluster_status = clusterStatus
self.m_bStop = False self.stop = False
def getName( self ):
return 'JobsStateUpdater'
def getGridEngine( self ):
return self.m_clusterStatus.getGridEngine()
def updateClusterStatus( self ):
#log('JobsStateUpdater::updateClusterStatus : start')
jobsState = self.getGridEngine().getCurrentJobsState() def get_name(self):
return 'JobsStateUpdater'
def get_grid_engine(self):
return self.cluster_status.get_grid_engine()
def updateClusterStatus(self):
# log('JobsStateUpdater::updateClusterStatus : start')
jobsState = self.get_grid_engine().get_current_job_state()
# update the jobs in the cluster status # update the jobs in the cluster status
self.m_clusterStatus.onNewJobsState( jobsState ) self.cluster_status.on_new_jobs_state(jobsState)
#log('JobsStateUpdater::updateClusterStatus : end') # log('JobsStateUpdater::updateClusterStatus : end')
def run( self ): def run(self):
try: try:
while not self.m_bStop : while not self.stop:
self.updateClusterStatus() self.updateClusterStatus()
time.sleep(JobsStateUpdater.DELAY_BETWEEN_STATUS_CHECKS) time.sleep(JobsStateUpdater.DELAY_BETWEEN_STATUS_CHECKS)
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt) except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
Util.onException(exception) on_exception(exception)
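A minimal sketch of this thread's intended lifecycle, assuming a ClusterStatus instance is already available; the stop attribute is the cooperative shutdown flag checked by run() above.

    updater = JobsStateUpdater(cluster_status)
    updater.start()          # refreshes the jobs state every DELAY_BETWEEN_STATUS_CHECKS seconds
    # ... the controller does its work ...
    updater.stop = True      # ask the polling loop to exit after its current iteration
    updater.join()           # may block for up to DELAY_BETWEEN_STATUS_CHECKS seconds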


@ -5,10 +5,10 @@ gLogFilePath = '/tmp/ClusterController.log' # '/var/log/ClusterController.log'
def log(message): def log(message):
threadName = threading.currentThread().getName() threadName = threading.current_thread().name
logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message
print(logMessage) print(logMessage)
f = open(gLogFilePath, 'a+') f = open(gLogFilePath, 'a+', encoding='utf8')
assert f assert f
try: try:
f.write(logMessage + '\n') f.write(logMessage + '\n')
@ -21,11 +21,11 @@ def logDebug(message):
return return
def logInfo(message): def log_info(message):
log('[I]' + message) log('[I]' + message)
def logWarning(message): def log_warning(message):
log('[W]' + message) log('[W]' + message)


@ -1,5 +1,8 @@
class PowerState: import enum
class PowerState(enum.Enum):
UNKNOWN = 0 UNKNOWN = 0
OFF = 1 OFF = 1
ON = 2 ON = 2
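Because PowerState is now an enum.Enum, members compare by identity and the raw integer moves behind .value; a small sketch of the consequences, assuming the remaining members keep the integer pattern shown above.

    from .PowerState import PowerState   # relative import, as in this changeset

    state = PowerState.ON
    assert state is PowerState.ON          # identity comparison still works
    assert state.value == 2                # the integer code is now behind .value
    assert PowerState(2) is PowerState.ON  # integers read from storage must be converted explicitly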


@ -2,7 +2,6 @@ import io
import re import re
from .JobsState import JobsState from .JobsState import JobsState
from .QueueMachine import QueueMachine, QueueMachineStateFlags from .QueueMachine import QueueMachine, QueueMachineStateFlags
from .Util import *
from .Log import logError from .Log import logError
from .Job import JobStateFlags, TaskUid, Task, ParallelEnvironment, JobState from .Job import JobStateFlags, TaskUid, Task, ParallelEnvironment, JobState
import logging import logging
@ -54,13 +53,13 @@ class QstatParser:
assert False, 'unhandled queue machine state flag :"' + c + '"' assert False, 'unhandled queue machine state flag :"' + c + '"'
return queueMachineState return queueMachineState
def parseQstatOutput(self, qstatOutput, cluster_domain: str = 'ipr.univ-rennes1.fr'): def parse_qstat_output(self, qstat_output: str, cluster_domain: str = 'ipr.univ-rennes1.fr'):
""" """
parses result of command 'qstat -f -u \\* -pri' parses result of command 'qstat -f -u \\* -pri'
cluster_domain: network domain of the cluster (eg 'ipr.univ-rennes.fr'). This information is missing from qstat's output and is used to form the fully qualified domain name of the cluster machines. cluster_domain: network domain of the cluster (eg 'ipr.univ-rennes.fr'). This information is missing from qstat's output and is used to form the fully qualified domain name of the cluster machines.
""" """
logging.debug('qstatOutput type : %s' % type(qstatOutput)) logging.debug('qstatOutput type : %s', type(qstat_output))
def parse_pending_tasks(task_ranges_sequence): def parse_pending_tasks(task_ranges_sequence):
""" """
@ -99,13 +98,13 @@ class QstatParser:
# --------------------------------------------------------------------------------- # ---------------------------------------------------------------------------------
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64 # main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*' # TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
qstatOutput = re.sub(r'\.ipr\.univ[^ ]*', f'.{cluster_domain}', qstatOutput) qstat_output = re.sub(r'\.ipr\.univ[^ ]*', f'.{cluster_domain}', qstat_output)
jobsState = JobsState() jobsState = JobsState()
f = io.StringIO(qstatOutput) f = io.StringIO(qstat_output)
line = f.readline() line = f.readline()
currentQueueMachine = None current_queue_machine = None
bInPendingJobsSection = False in_pending_jobs_section = False
# examples of job line : # examples of job line :
# 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1 # 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1
# a typical job line in the pending jobs section looks like this : # a typical job line in the pending jobs section looks like this :
@ -120,42 +119,42 @@ class QstatParser:
# ntckts The job's ticket amount in normalized fashion. # ntckts The job's ticket amount in normalized fashion.
# ppri The job's -p priority as specified by the user. # ppri The job's -p priority as specified by the user.
jobRegularExp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$') job_regular_exp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$')
# example of machine line : # example of machine line :
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86 # allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
machineRegularExp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)') machine_regular_exp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)')
pendingJobsHeaderRegularExp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*') pending_jobs_header_regular_exp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*')
while len(line) > 0: while len(line) > 0:
# print line # print line
# check if the current line is a line describing a job running on a machine # check if the current line is a line describing a job running on a machine
matchObj = jobRegularExp.match(line) match_obj = job_regular_exp.match(line)
if matchObj: if match_obj:
# we are dealing with a job line # we are dealing with a job line
if not bInPendingJobsSection: if not in_pending_jobs_section:
assert currentQueueMachine assert current_queue_machine
# log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"') # log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
job_id = int(matchObj.group('jobId')) job_id = int(match_obj.group('jobId'))
logging.debug('iJobId = %d' % job_id) logging.debug('iJobId = %d', job_id)
jobState = self.parseJobState(matchObj.group('jobStatus')) job_state = self.parseJobState(match_obj.group('jobStatus'))
strJobArrayDetails = matchObj.group('jobArrayDetails') job_array_details = match_obj.group('jobArrayDetails')
bIsJobArray = (len(strJobArrayDetails) != 0) is_job_array = (len(job_array_details) != 0)
# logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray))) # logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
# each element of a job array is treated as a separate job for the sake of simplicity. # each element of a job array is treated as a separate job for the sake of simplicity.
# For these elements, the job id in sge sense is the same, but they are different in this program's sense # For these elements, the job id in sge sense is the same, but they are different in this program's sense
task_ids = range(0, 1) # just one element, unless it's a job array task_ids = range(0, 1) # just one element, unless it's a job array
if bIsJobArray: if is_job_array:
if bInPendingJobsSection: if in_pending_jobs_section:
task_ids = parse_pending_tasks(strJobArrayDetails) task_ids = parse_pending_tasks(job_array_details)
else: else:
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element # we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
iJobArrayElementIndex = int(strJobArrayDetails) task_id = int(job_array_details)
assert iJobArrayElementIndex != 0 # sge does not allow element indices to be 0 assert task_id != 0 # sge does not allow element indices to be 0
task_ids = range(iJobArrayElementIndex, iJobArrayElementIndex + 1) task_ids = range(task_id, task_id + 1)
logging.debug('task_ids = %s' % task_ids) logging.debug('task_ids = %s', task_ids)
for task_id in task_ids: for task_id in task_ids:
logging.debug('task_id = %s' % task_id) logging.debug('task_id = %s', task_id)
task_uid = None task_uid = None
if bIsJobArray: if is_job_array:
task_uid = TaskUid(job_id, task_id) task_uid = TaskUid(job_id, task_id)
else: else:
task_uid = TaskUid(job_id) task_uid = TaskUid(job_id)
@ -165,57 +164,57 @@ class QstatParser:
# this job hasn't been encountered yet in the output of qstat ... # this job hasn't been encountered yet in the output of qstat ...
# we could either be in the pending jobs section or in the running jobs section # we could either be in the pending jobs section or in the running jobs section
task = Task(task_uid) task = Task(task_uid)
jobsState.addTask(task) jobsState.add_task(task)
task.set_state(jobState) task.set_state(job_state)
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime') job_start_or_submit_time_as_str = match_obj.group('jobStartOrSubmitTime')
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S') job_start_or_submit_time = time.strptime(job_start_or_submit_time_as_str, '%m/%d/%Y %H:%M:%S')
if bInPendingJobsSection: if in_pending_jobs_section:
task.get_submit_time(jobStartOrSubmitTime) task.get_submit_time(job_start_or_submit_time)
else: else:
task.set_start_time(jobStartOrSubmitTime) task.set_start_time(job_start_or_submit_time)
task.set_owner(matchObj.group('jobOwner')) task.set_owner(match_obj.group('jobOwner'))
task.set_script_name(matchObj.group('jobScriptName')) task.set_script_name(match_obj.group('jobScriptName'))
if bInPendingJobsSection: if in_pending_jobs_section:
task.set_num_required_slots(int(matchObj.group('numSlots'))) task.set_num_required_slots(int(match_obj.group('numSlots')))
else: else:
assert not bInPendingJobsSection # if we are in the pending jobs section, the job should be new assert not in_pending_jobs_section # if we are in the pending jobs section, the job should be new
if not bInPendingJobsSection: if not in_pending_jobs_section:
task.add_slots(currentQueueMachine.getName(), int(matchObj.group('numSlots'))) task.add_slots(current_queue_machine.get_name(), int(match_obj.group('numSlots')))
else: else:
# the current line does not describe a job # the current line does not describe a job
if not bInPendingJobsSection: if not in_pending_jobs_section:
# check if this line describes the status of a machine # check if this line describes the status of a machine
matchObj = machineRegularExp.match(line) match_obj = machine_regular_exp.match(line)
if matchObj: if match_obj:
queueName = matchObj.group('queueName') queue_name = match_obj.group('queueName')
machineName = matchObj.group('machineName') machine_name = match_obj.group('machineName')
queueMachine = QueueMachine(queueName, machineName) queue_machine = QueueMachine(queue_name, machine_name)
# log(line) # log(line)
# log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString')) # log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
# log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots')) # log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
queueMachine.setNumSlots(int(matchObj.group('numTotalSlots'))) queue_machine.set_num_slots(int(match_obj.group('numTotalSlots')))
queueMachine.setNumUsedSlots(int(matchObj.group('numUsedSlots'))) queue_machine.set_num_used_slots(int(match_obj.group('numUsedSlots')))
strCpuLoad = matchObj.group('cpuLoad') cpu_load_as_str = match_obj.group('cpuLoad')
if strCpuLoad != '-NA-': if cpu_load_as_str != '-NA-':
queueMachine.setCpuLoad(float(strCpuLoad)) queue_machine.set_cpu_load(float(cpu_load_as_str))
strQueueMachineState = matchObj.group('queueMachineStatus') queue_machine_state_as_str = match_obj.group('queueMachineStatus')
queueMachine.setState(self.parseQueueMachineState(strQueueMachineState)) queue_machine.set_state(self.parseQueueMachineState(queue_machine_state_as_str))
# log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"') # log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
# log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"') # log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
currentQueueMachine = queueMachine current_queue_machine = queue_machine
jobsState.addQueueMachine(queueMachine) jobsState.add_queue_machine(queue_machine)
else: else:
matchObj = pendingJobsHeaderRegularExp.match(line) match_obj = pending_jobs_header_regular_exp.match(line)
if matchObj: if match_obj:
bInPendingJobsSection = True in_pending_jobs_section = True
currentQueueMachine = None current_queue_machine = None
else: else:
pass pass
else: else:
# we are in a pending jobs section # we are in a pending jobs section
matchObj = re.match('^[#]+$', line) match_obj = re.match('^[#]+$', line)
if not matchObj: if not match_obj:
# unexpected line # unexpected line
print('line = "' + line + '"') print('line = "' + line + '"')
assert False assert False
@ -223,11 +222,11 @@ class QstatParser:
f.close() f.close()
return jobsState return jobsState
def parseJobDetails(self, qstatOutput, job): def parse_job_details(self, qstat_output: str, task: Task):
""" """
adds to job the details parsed from the output of the "qstat -j <jobid>" command adds to job the details parsed from the output of the "qstat -j <jobid>" command
""" """
f = io.StringIO(qstatOutput) f = io.StringIO(qstat_output)
line = f.readline() line = f.readline()
fieldRegularExp = re.compile('^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$') fieldRegularExp = re.compile('^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$')
while len(line) > 0: while len(line) > 0:
@ -238,20 +237,20 @@ class QstatParser:
fieldName = matchObj.group('fieldName') fieldName = matchObj.group('fieldName')
strFieldValue = matchObj.group('fieldValue') strFieldValue = matchObj.group('fieldValue')
if fieldName == 'job_number': if fieldName == 'job_number':
assert job.getId().asStr() == strFieldValue assert task.getId().asStr() == strFieldValue
elif fieldName == 'hard_queue_list': elif fieldName == 'hard_queue_list':
allowedQueues = strFieldValue.split(',') allowedQueues = strFieldValue.split(',')
assert len(allowedQueues) > 0 assert len(allowedQueues) > 0
job.m_jobRequirements.m_queues = allowedQueues task.job_requirements.queues = allowedQueues
elif fieldName == 'parallel environment': elif fieldName == 'parallel environment':
# the value could be 'ompi range: 32' # the value could be 'ompi range: 32'
matchObj = re.match('ompi range: (?P<numSlots>[0-9]+)[?]*', strFieldValue) matchObj = re.match('ompi range: (?P<numSlots>[0-9]+)[?]*', strFieldValue)
if matchObj: if matchObj:
job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI task.job_requirements.parallel_environment = ParallelEnvironment.MPI
else: else:
assert False assert False
else: else:
# ignore he other fields # ignore the other fields
None pass
line = f.readline() line = f.readline()
f.close() f.close()
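The re.sub workaround near the top of parse_qstat_output exists because qstat truncates long host names; a small, self-contained illustration of what that substitution does to a machine line (the host name is only an example).

    import re

    line = 'main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64'
    fixed = re.sub(r'\.ipr\.univ[^ ]*', '.ipr.univ-rennes1.fr', line)
    # fixed == 'main.q@physix88.ipr.univ-rennes1.fr BIP 0/0/36 14.03 lx-amd64'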


@ -1,3 +1,7 @@
from typing import Optional
from .Job import QueueMachineId
from .ClusterNode import ClusterNodeId
class QueueMachineStateFlags: # class QueueMachineStateFlags: #
DISABLED = 1 # the queue machine is disabled DISABLED = 1 # the queue machine is disabled
@ -12,70 +16,73 @@ class QueueMachine:
""" """
a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10) a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
""" """
def __init__(self, queueName, machineName): queue_name: str
self.m_queueName = queueName machine_name: ClusterNodeId
self.m_machineName = machineName num_slots: Optional[int]
self.m_numSlots = None num_used_slots: Optional[int]
self.m_numUsedSlots = None cpu_load: Optional[float]
self.m_fCpuLoad = None state_flags: int
self.m_stateFlags = 0 disable_message: str
self.m_strDisableMessage = ''
def getName(self): def __init__(self, queueName, machineName):
self.queue_name = queueName
self.machine_name = machineName
self.num_slots = None
self.num_used_slots = None
self.cpu_load = None
self.state_flags = 0
self.disable_message = ''
def get_name(self) -> QueueMachineId:
""" """
returns the name of the machine queue (such as allintel.q@simpatix10) returns the name of the machine queue (such as allintel.q@simpatix10)
""" """
return self.m_queueName + '@' + self.m_machineName return self.queue_name + '@' + self.machine_name
def getQueueName(self): def get_queue_name(self) -> str:
return self.m_queueName return self.queue_name
def getMachineName(self): def get_machine_name(self) -> str:
return self.m_machineName return self.machine_name
def setNumSlots(self, numSlots): def set_num_slots(self, num_slots: int):
self.m_numSlots = numSlots self.num_slots = num_slots
def setNumUsedSlots(self, numSlots): def set_num_used_slots(self, num_slots: int):
self.m_numUsedSlots = numSlots self.num_used_slots = num_slots
def getNumSlots(self): def get_num_slots(self) -> int:
assert self.m_numSlots is not None assert self.num_slots is not None
return self.m_numSlots return self.num_slots
def getNumUsedSlots(self): def get_num_used_slots(self) -> int:
assert self.m_numUsedSlots is not None assert self.num_used_slots is not None
return self.m_numUsedSlots return self.num_used_slots
def setCpuLoad(self, fCpuLoad): def set_cpu_load(self, cpu_load: float):
self.m_fCpuLoad = fCpuLoad self.cpu_load = cpu_load
def cpuLoadIsAvailable(self): def cpu_load_is_available(self) -> bool:
return self.m_fCpuLoad is not None return self.cpu_load is not None
def getCpuLoad(self): def get_cpu_load(self) -> float:
assert self.m_fCpuLoad is not None assert self.cpu_load is not None
return self.m_fCpuLoad return self.cpu_load
def setState(self, state): def set_state(self, state: int):
self.m_stateFlags = state self.state_flags = state
def isDisabled(self): def is_disabled(self) -> bool:
return self.m_stateFlags & QueueMachineStateFlags.DISABLED return self.state_flags & QueueMachineStateFlags.DISABLED
def isInErrorState(self): def is_in_error_state(self) -> bool:
return self.m_stateFlags & QueueMachineStateFlags.ERROR return self.state_flags & QueueMachineStateFlags.ERROR
def isResponding(self): def is_responding(self) -> bool:
return not (self.m_stateFlags & QueueMachineStateFlags.UNKNOWN) return not (self.state_flags & QueueMachineStateFlags.UNKNOWN)
def isInAlarmState(self): def is_in_alarm_state(self) -> bool:
return self.m_stateFlags & QueueMachineStateFlags.ALARM return self.state_flags & QueueMachineStateFlags.ALARM
def isSuspended(self): def is_suspended(self) -> bool:
return self.m_stateFlags & QueueMachineStateFlags.SUSPENDED return self.state_flags & QueueMachineStateFlags.SUSPENDED
"""
def getStateAsString(self):
assert(self.m_strState is not None)
return self.m_strState
"""


@ -1,21 +1,24 @@
from PowerState import PowerState
from Log import logInfo import abc
import time import time
import copy import copy
from .PowerState import PowerState
from .Log import log_info
class Slot: class Slot:
def __init__(self): def __init__(self):
self.m_queueMachine = None self.queue_machine = None
self.m_numSlots = None self.num_slots = None
self.m_job = None # job for which this slot is allocated self.jobs = None # job for which this slot is allocated
class SlotAllocator: class SlotAllocator(abc.ABC):
""" """
a class that defines a strategy for allocating free slots for the given pending jobs a class that defines a strategy for allocating free slots for the given pending jobs
""" """
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState): @abc.abstractmethod
def get_machinesThatNeedWakeUp(self, pendingJobs, clusterState):
""" """
returns the list of machines that need to wake up to make pending jobs running returns the list of machines that need to wake up to make pending jobs running
""" """
@ -23,44 +26,45 @@ class SlotAllocator:
class SimpleSlotAllocator(SlotAllocator): class SimpleSlotAllocator(SlotAllocator):
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
def get_machinesThatNeedWakeUp(self, pendingJobs, clusterState):
machinesThatNeedWakeUp = {} machinesThatNeedWakeUp = {}
highestPriorityPendingJob = pendingJobs.values()[0] highestPriorityPendingJob = list(pendingJobs.values())[0]
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr()) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())
numFreeSlots = {} # contains the number of free slots for each queueMachine numFreeSlots = {} # contains the number of free slots for each queueMachine
for queueMachine in clusterState.getJobsState().getQueueMachines().values(): for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
numFreeSlots[queueMachine] = clusterState.getJobsState().getNumFreeSlotsOnQueueMachine(queueMachine) numFreeSlots[queueMachine] = clusterState.get_jobs_state().get_num_free_slots_on_queue_machine(queueMachine)
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[%s] with %d ' % (queueMachine.getName(), numFreeSlots[queueMachine])) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : init numFreeSlots[%s] with %d ' % (queueMachine.get_name(), numFreeSlots[queueMachine]))
remainingNumSlotsToAllocate = highestPriorityPendingJob.m_jobRequirements.m_numSlots remainingNumSlotsToAllocate = highestPriorityPendingJob.job_requirements.num_slots
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
# first look in running machines if there are available slots # first look in running machines if there are available slots
for queueMachine in clusterState.getJobsState().getQueueMachines().values(): for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName()) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.get_name())
machine = clusterState.getMachines()[queueMachine.getMachineName()] machine = clusterState.get_machines()[queueMachine.get_machine_name()]
if machine.getPowerState() == PowerState.ON: if machine.get_power_state() == PowerState.ON:
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements): if clusterState.queue_machine_fits_job_requirements(queueMachine, highestPriorityPendingJob.job_requirements):
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate) numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName())) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.get_machine_name()))
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
assert remainingNumSlotsToAllocate >= 0 assert remainingNumSlotsToAllocate >= 0
if remainingNumSlotsToAllocate == 0: if remainingNumSlotsToAllocate == 0:
break break
if remainingNumSlotsToAllocate > 0: if remainingNumSlotsToAllocate > 0:
# now look into machines that are asleep # now look into machines that are asleep
for queueMachine in clusterState.getJobsState().getQueueMachines().values(): for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName()) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.get_name())
machine = clusterState.getMachines()[queueMachine.getMachineName()] machine = clusterState.get_machines()[queueMachine.get_machine_name()]
if machine.getPowerState() == PowerState.SLEEP: if machine.get_power_state() == PowerState.SLEEP:
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements): if clusterState.queue_machine_fits_job_requirements(queueMachine, highestPriorityPendingJob.job_requirements):
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate) numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName())) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.get_machine_name()))
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
machinesThatNeedWakeUp[machine.getName()] = machine machinesThatNeedWakeUp[machine.get_name()] = machine
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate) log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
assert remainingNumSlotsToAllocate >= 0 assert remainingNumSlotsToAllocate >= 0
if remainingNumSlotsToAllocate == 0: if remainingNumSlotsToAllocate == 0:
break break
@ -75,9 +79,9 @@ class DecoupledSlotAllocator(SlotAllocator):
Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in. Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
""" """
def __init__(self): def __init__(self):
self.m_delayBetweenPeriodicChecks = -1 # in seconds. Disable periodic checks by setting this to -1 self.delay_between_periodic_checks = -1 # in seconds. Disable periodic checks by setting this to -1
self.m_lastCheckTime = time.time() self.last_check_time = time.time()
self.m_lastClusterState = None self.last_cluster_state = None
def jobsStateHasChanged(self, newClusterState): def jobsStateHasChanged(self, newClusterState):
""" """
@ -85,26 +89,24 @@ class DecoupledSlotAllocator(SlotAllocator):
to start (provided all machines are enabled) to start (provided all machines are enabled)
""" """
oldJobs = {} oldJobs = {}
if self.m_lastClusterState: if self.last_cluster_state:
oldJobs = self.m_lastClusterState.m_jobsState.m_jobs oldJobs = self.last_cluster_state.jobs_state.tasks
newJobs = newClusterState.m_jobsState.m_jobs newJobs = newClusterState.jobs_state.tasks
bJobsHaveChanged = False bJobsHaveChanged = False
oldJobsOnly = oldJobs.copy() # shallow copy oldJobsOnly = oldJobs.copy() # shallow copy
# print 'oldJobs : ', oldJobs # print 'oldJobs : ', oldJobs
# print 'newJobs : ', newJobs # print 'newJobs : ', newJobs
""" # print 'self.last_cluster_state', self.last_cluster_state
print 'self.m_lastClusterState', self.m_lastClusterState # print 'newClusterState', newClusterState
print 'newClusterState', newClusterState # if self.last_cluster_state:
if self.m_lastClusterState: # print 'self.last_cluster_state.jobs_state', self.last_cluster_state.jobs_state
print 'self.m_lastClusterState.m_jobsState', self.m_lastClusterState.m_jobsState # print 'newClusterState.jobs_state', newClusterState.jobs_state
print 'newClusterState.m_jobsState', newClusterState.m_jobsState # print 'id(self.last_cluster_state) : ', id(self.last_cluster_state)
print 'id(self.m_lastClusterState) : ', id(self.m_lastClusterState) # print 'id(newClusterState) : ', id(newClusterState)
print 'id(newClusterState) : ', id(newClusterState) # print 'len(oldJobs) : ', len(oldJobs)
print 'len(oldJobs) : ', len(oldJobs) # print 'len(newJobs) : ', len(newJobs)
print 'len(newJobs) : ', len(newJobs) # print 'id(oldJobs) : ', id(oldJobs)
print 'id(oldJobs) : ', id(oldJobs) # print 'id(newJobs) : ', id(newJobs)
print 'id(newJobs) : ', id(newJobs)
"""
for newJob in newJobs.values(): for newJob in newJobs.values():
# logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr()) # logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
if newJob.getId() in oldJobs: if newJob.getId() in oldJobs:
@ -112,36 +114,36 @@ class DecoupledSlotAllocator(SlotAllocator):
del oldJobsOnly[newJob.getId()] del oldJobsOnly[newJob.getId()]
else: else:
# ah ... a new job has arrived # ah ... a new job has arrived
logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr()) log_info('A new job (jobId =%s) has been detected ' % newJob.getId().asStr())
bJobsHaveChanged = True bJobsHaveChanged = True
if len(oldJobsOnly) != 0: if len(oldJobsOnly) != 0:
for oldJob in oldJobsOnly.values(): for oldJob in oldJobsOnly.values():
logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr()) log_info('Job (jobId =%s) has finished' % oldJob.getId().asStr())
# at least one old job has finished, freeing some slots # at least one old job has finished, freeing some slots
bJobsHaveChanged = True bJobsHaveChanged = True
return bJobsHaveChanged return bJobsHaveChanged
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState): def get_machinesThatNeedWakeUp(self, pendingJobs, clusterState):
machinesThatNeedWakeUp = {} machinesThatNeedWakeUp = {}
bJobsStateHasChanged = self.jobsStateHasChanged(clusterState) bJobsStateHasChanged = self.jobsStateHasChanged(clusterState) # pylint: disable=no-value-for-parameter
currentTime = time.time() currentTime = time.time()
# we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged # we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
# for example changes in the requirements, in the allocation policy, etc... # for example changes in the requirements, in the allocation policy, etc...
bItsTimeForPeriodicCheck = False bItsTimeForPeriodicCheck = False
if self.m_delayBetweenPeriodicChecks > 0: if self.delay_between_periodic_checks > 0:
bItsTimeForPeriodicCheck = (currentTime - self.m_lastCheckTime) > self.m_delayBetweenPeriodicChecks bItsTimeForPeriodicCheck = (currentTime - self.last_check_time) > self.delay_between_periodic_checks
if bJobsStateHasChanged or bItsTimeForPeriodicCheck: if bJobsStateHasChanged or bItsTimeForPeriodicCheck:
if bJobsStateHasChanged: if bJobsStateHasChanged:
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed') log_info('DecoupledSlotAllocator::get_machinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed')
else: else:
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)') log_info('DecoupledSlotAllocator::get_machinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)')
for queueMachine in clusterState.getJobsState().getQueueMachines().values(): for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
if queueMachine.getMachineName() in clusterState.getMachines(): if queueMachine.get_machine_name() in clusterState.get_machines():
# this means that the machine is under the cluster controller's control # this means that the machine is under the cluster controller's control
machine = clusterState.getMachines()[queueMachine.getMachineName()] machine = clusterState.get_machines()[queueMachine.get_machine_name()]
if machine.getPowerState() == PowerState.SLEEP: if machine.get_power_state() == PowerState.SLEEP:
machinesThatNeedWakeUp[machine.getName()] = machine machinesThatNeedWakeUp[machine.get_name()] = machine
self.m_lastCheckTime = currentTime self.last_check_time = currentTime
self.m_lastClusterState = copy.copy(clusterState) self.last_cluster_state = copy.copy(clusterState)
# print 'self.m_lastClusterState', self.m_lastClusterState # print 'self.last_cluster_state', self.last_cluster_state
return machinesThatNeedWakeUp return machinesThatNeedWakeUp
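A hedged sketch of how a controller loop might drive the allocator above; cluster_status is assumed to be a ready ClusterStatus, and wake_up() is a guess at the ClusterNode interface, which this diff does not show.

    from .Log import log_info
    from .SlotAllocator import DecoupledSlotAllocator

    allocator = DecoupledSlotAllocator()
    pending = cluster_status.get_pending_jobs()
    if pending:
        for name, machine in allocator.get_machinesThatNeedWakeUp(pending, cluster_status).items():
            log_info('waking up %s so pending jobs get a chance to start' % name)
            machine.wake_up()    # hypothetical ClusterNode method, not part of this changeset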


@ -1,58 +1,58 @@
import time import time
from Util import executeProgram from .Util import execute_program
from QstatParser import QstatParser from .QstatParser import QstatParser
from Log import logDebug, logWarning from .Log import logDebug, log_warning
class SunGridEngine: class SunGridEngine:
def getCurrentJobsState(self): def get_current_job_state(self):
bBUG_00000009_IS_STILL_ALIVE = True bBUG_00000009_IS_STILL_ALIVE = True
if bBUG_00000009_IS_STILL_ALIVE: if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Querying the current state of jobs') logDebug('Querying the current state of jobs')
returnCode = -1 return_code = -1
delayBetweenAttemps = 5 # in seconds delay_between_attempts = 5 # in seconds
while returnCode != 0: while return_code != 0:
command = ['qstat', '-f', '-u', '*'] command = ['qstat', '-f', '-u', '*']
(returnCode, qstatOutput, stderr) = executeProgram(command) (return_code, qstat_output, stderr) = execute_program(command)
if returnCode != 0: if return_code != 0:
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps)) log_warning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), return_code, qstat_output, stderr, delay_between_attempts))
time.sleep(delayBetweenAttemps) time.sleep(delay_between_attempts)
if bBUG_00000009_IS_STILL_ALIVE: if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Just got current state of jobs') logDebug('Just got current state of jobs')
jobsState = QstatParser().parseQstatOutput(qstatOutput) jobs_state = QstatParser().parse_qstat_output(qstat_output)
jobsState.setTime(time.time()) jobs_state.set_time(time.time())
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge # read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
if False: # no need for job details at the moment and since it's very slow, it's been disabled if False: # no need for job details at the moment and since it's very slow, it's been disabled
for unused_jobId, job in jobsState.getPendingJobs().items(): for unused_jobId, job in jobs_state.get_pending_jobs().items():
(returnCode, stdout, stderr) = executeProgram(['qstat', '-j', job.getId().asStr()]) (return_code, stdout, stderr) = execute_program(['qstat', '-j', job.getId().asStr()])
assert returnCode != 0, 'prout' assert return_code != 0, 'prout'
QstatParser().parseJobDetails(stdout, job) QstatParser().parse_job_details(stdout, job)
return jobsState return jobs_state
def setQueueInstanceActivation(self, strQueueInstanceName, bEnable): def set_queue_instance_activation(self, queue_instance_name: str, enable: bool):
argument = 'd' argument = 'd'
if bEnable: if enable:
argument = 'e' argument = 'e'
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use) bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
delayBetweenAttemps = 5 # in seconds delay_between_attempts = 5 # in seconds
while True: while True:
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-' + argument, strQueueInstanceName]) error_code, unused_stdout, unused_stderr = execute_program(['qmod', '-' + argument, queue_instance_name])
if bBUG_00000269_IS_STILL_ALIVE: if bBUG_00000269_IS_STILL_ALIVE:
# if the command failed, try again # if the command failed, try again
if errorCode == 0: if error_code == 0:
break break
time.sleep(delayBetweenAttemps) time.sleep(delay_between_attempts)
else: else:
break break
return (errorCode == 0) return (error_code == 0)
def queueIsEmpty(self, strMachineName): def queue_is_empty(self, machine_name: str):
(returnCode, qstatOutput, unused_stderr) = executeProgram(['qstat', '-f', '-u', '*']) (returnCode, qstat_output, unused_stderr) = execute_program(['qstat', '-f', '-u', '*'])
assert returnCode == 0 assert returnCode == 0
jobsState = QstatParser().parseQstatOutput(qstatOutput) jobs_state = QstatParser().parse_qstat_output(qstat_output)
jobs = jobsState.getJobsOnMachine(strMachineName) jobs = jobs_state.get_jobs_on_machine(machine_name)
return (len(jobs) == 0) return (len(jobs) == 0)
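get_current_job_state above loops on qstat until it returns 0 (the bug 00000009 workaround). A self-contained sketch of that retry-until-success pattern, written against subprocess directly rather than the project's execute_program helper:

# standalone sketch of the retry-until-success pattern used above
import subprocess
import time

def run_until_success(command, delay_between_attempts=5, max_attempts=10):
    """Run `command` (a list of arguments) until it returns 0, waiting between attempts."""
    for attempt in range(1, max_attempts + 1):
        completed = subprocess.run(command, capture_output=True, text=True)
        if completed.returncode == 0:
            return completed.stdout
        print('attempt %d of %r failed (stderr=%r), retrying in %d seconds'
              % (attempt, command, completed.stderr, delay_between_attempts))
        time.sleep(delay_between_attempts)
    raise RuntimeError('%r still failing after %d attempts' % (command, max_attempts))

# usage (assumes qstat is installed): run_until_success(['qstat', '-f', '-u', '*'])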


@ -1,24 +1,21 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys from .Log import log_info
sys.path.insert(0, '..') from .Util import get_power_state, blocking_put_machine_to_sleep, blocking_wake_up_machine, execute_command, execute_ipmi_command
from Log import logInfo from .PowerState import PowerState
import Util
from PowerState import PowerState
from HTMLParser import HTMLParser
def Test0000(): def Test0000():
logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine') log_info('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
strTargetMachineName = 'simpatix12' strTarget_machine_name = 'simpatix12'
ePowerState = Util.getPowerState(strTargetMachineName) ePowerState = get_power_state(strTarget_machine_name)
while True: while True:
if ePowerState == PowerState.ON: if ePowerState == PowerState.ON:
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName) bSuccess = blocking_put_machine_to_sleep(strTarget_machine_name)
assert bSuccess assert bSuccess
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName) bSuccess = blocking_put_machine_to_sleep(strTarget_machine_name)
ePowerState = PowerState.SLEEP ePowerState = PowerState.SLEEP
elif ePowerState == PowerState.SLEEP: elif ePowerState == PowerState.SLEEP:
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = blocking_wake_up_machine(strTarget_machine_name)
assert bSuccess assert bSuccess
ePowerState = PowerState.ON ePowerState = PowerState.ON
else: else:
@ -26,30 +23,30 @@ def Test0000():
def Test0001(): def Test0001():
logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same time?') log_info('Testing bug 00000003 : could it be caused by a sleep and a power on at the same time?')
strTargetMachineName = 'simpatix12' strTarget_machine_name = 'simpatix12'
ePowerState = Util.getPowerState(strTargetMachineName) ePowerState = get_power_state(strTarget_machine_name)
if ePowerState == PowerState.SLEEP: if ePowerState == PowerState.SLEEP:
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = blocking_wake_up_machine(strTarget_machine_name)
assert bSuccess assert bSuccess
ePowerState = PowerState.ON ePowerState = PowerState.ON
assert ePowerState == PowerState.ON assert ePowerState == PowerState.ON
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName) execute_command("ssh %s 'pmset sleepnow'" % strTarget_machine_name)
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = blocking_wake_up_machine(strTarget_machine_name)
assert bSuccess assert bSuccess
def Test0002(): def Test0002():
logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?') log_info('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
strTargetMachineName = 'simpatix12' strTarget_machine_name = 'simpatix12'
ePowerState = Util.getPowerState(strTargetMachineName) ePowerState = get_power_state(strTarget_machine_name)
if ePowerState == PowerState.ON: if ePowerState == PowerState.ON:
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = blocking_wake_up_machine(strTarget_machine_name)
assert bSuccess assert bSuccess
ePowerState = PowerState.SLEEP ePowerState = PowerState.SLEEP
assert ePowerState == PowerState.SLEEP assert ePowerState == PowerState.SLEEP
Util.executeIpmiCommand(strTargetMachineName, 'chassis power on') execute_ipmi_command(strTarget_machine_name, 'chassis power on')
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName) execute_command("ssh %s 'pmset sleepnow'" % strTarget_machine_name)
if __name__ == '__main__': if __name__ == '__main__':


@ -1,80 +1,81 @@
# import .Util
# import ..SimpaDbUtil
from .Log import logDebug, logInfo, logWarning, logError
from .PowerState import PowerState, PowerStateToStr
import re import re
import io import io
import os import os
import traceback import traceback
import sys import sys
import time
from ..Util import execute_program as exe_prog
from ..Util import execute_command as exe_comm
from ..Util import send_text_mail
from ..SimpaDbUtil import getLightOutManagementIpAddress, is_machine_responding
from .Log import logDebug, log_info, log_warning, logError
from .PowerState import PowerState, PowerStateToStr
def executeProgram(astrArguments): def execute_program(astrArguments):
bBUG_00000008_IS_STILL_ACTIVE = True bBUG_00000008_IS_STILL_ACTIVE = True
if bBUG_00000008_IS_STILL_ACTIVE: if bBUG_00000008_IS_STILL_ACTIVE:
logDebug('executeProgram : program = [%s]' % (','.join(astrArguments))) logDebug('execute_program : program = [%s]' % (','.join(astrArguments)))
(returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments) (returnCode, stdout, stderr) = exe_prog(astrArguments)
if bBUG_00000008_IS_STILL_ACTIVE: if bBUG_00000008_IS_STILL_ACTIVE:
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode)) logDebug('execute_command : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
# for debugging purpose, log info in case the command failed # for debugging purpose, log info in case the command failed
if returnCode != 0: if returnCode != 0:
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode)) logDebug('execute_command : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
logDebug('executeCommand : stdout of [%s] = %s' % (','.join(astrArguments), stdout)) logDebug('execute_command : stdout of [%s] = %s' % (','.join(astrArguments), stdout))
logDebug('executeCommand : stderr of [%s] = %s' % (','.join(astrArguments), stderr)) logDebug('execute_command : stderr of [%s] = %s' % (','.join(astrArguments), stderr))
return (returnCode, stdout, stderr) return (returnCode, stdout, stderr)
def executeCommand(command): def execute_command(command):
# logDebug('executeCommand : command = ' + command) # logDebug('execute_command : command = ' + command)
(returnCode, stdout, stderr) = Lib.Util.executeCommand(command) (returnCode, stdout, stderr) = exe_comm(command)
# logDebug('executeCommand : return code of "'+command+'" = '+str(returnCode)) # logDebug('execute_command : return code of "'+command+'" = '+str(returnCode))
return (returnCode, stdout, stderr) return (returnCode, stdout, stderr)
def executeIpmiCommand(machineName, ipmiCommandArgs): def execute_ipmi_command(machineName, ipmiCommandArgs):
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName) lomIpAddress = getLightOutManagementIpAddress(machineName)
lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt' lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath] astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
astrProgram.extend(ipmiCommandArgs) astrProgram.extend(ipmiCommandArgs)
# print 'executeIpmiCommand' # print 'execute_ipmi_command'
# print astrProgram # print astrProgram
bBUG_00000005_IS_STILL_ACTIVE = True bBUG_00000005_IS_STILL_ACTIVE = True
if bBUG_00000005_IS_STILL_ACTIVE: if bBUG_00000005_IS_STILL_ACTIVE:
# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged. # bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
bCommandSucceeded = False bCommandSucceeded = False
while not bCommandSucceeded: while not bCommandSucceeded:
(returnCode, stdout, stderr) = executeProgram(astrProgram) (returnCode, stdout, stderr) = execute_program(astrProgram)
if returnCode == 0: if returnCode == 0:
bCommandSucceeded = True bCommandSucceeded = True
else: else:
logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram)) log_warning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
else: else:
(returnCode, stdout, stderr) = executeProgram(astrProgram) (returnCode, stdout, stderr) = execute_program(astrProgram)
""" # sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State' # Unabled to establish a session with the BMC.
Unabled to establish a session with the BMC. # Command failed due to insufficient resources for session (0xFFFEF901)
Command failed due to insufficient resources for session (0xFFFEF901) # -> this error means that the number of active connections to the BMC has reached the maximum (usually 5).
-> this error means that the number of active connections to the BMC has reached the maximum (usually 5).
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State' # sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
Unabled to establish a session with the BMC. # Unabled to establish a session with the BMC.
Command failed due to Unknown (0xFFFEF923) (0xFFFEF923) # Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State' # sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
Unabled to establish a session with the BMC. # Unabled to establish a session with the BMC.
Command failed due to Timeout (0xFFFEF9C3) # Command failed due to Timeout (0xFFFEF9C3)
"""
return (returnCode, stdout, stderr) return (returnCode, stdout, stderr)
def getPowerState(machineName): def get_power_state(machineName):
ePowerState = PowerState.UNKNOWN ePowerState = PowerState.UNKNOWN
bPowerStateRead = False bPowerStateRead = False
iNumFailedAttempts = 0 iNumFailedAttempts = 0
while not bPowerStateRead: while not bPowerStateRead:
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State']) (returnCode, stdout, _stderr) = execute_ipmi_command(machineName, ['sensor', 'get', 'ACPI State'])
if returnCode == 0: if returnCode == 0:
matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout) matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
bBUG_00000002_IS_STILL_ACTIVE = True bBUG_00000002_IS_STILL_ACTIVE = True
@ -83,7 +84,7 @@ def getPowerState(machineName):
# the following warning has been commented out because it pollutes the logs and apparently # the following warning has been commented out because it pollutes the logs and apparently
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then # it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
# no power on event is logged ... # no power on event is logged ...
# logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power is on because that''s what I noticed when I had the case.' % machineName) # log_warning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power is on because that''s what I noticed when I had the case.' % machineName)
return PowerState.ON return PowerState.ON
else: else:
assert matchObj assert matchObj
@ -103,31 +104,31 @@ def getPowerState(machineName):
iMAX_NUM_ATTEMPTS = 5 iMAX_NUM_ATTEMPTS = 5
iNumFailedAttempts += 1 iNumFailedAttempts += 1
if iNumFailedAttempts < iMAX_NUM_ATTEMPTS: if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
logWarning('failed to read the power state of %s. I\'ll try again a bit later....' % machineName) log_warning('failed to read the power state of %s. I\'ll try again a bit later....' % machineName)
time.sleep(5) time.sleep(5)
else: else:
logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName) log_warning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
ePowerState = PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged ePowerState = PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged
bPowerStateRead = True bPowerStateRead = True
return ePowerState return ePowerState
def wakeUpMachine(machineName): def wake_up_machine(machineName):
""" """
this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect) this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
@return true on success, false otherwise @return true on success, false otherwise
@note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state @note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
""" """
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['chassis', 'power', 'on']) (returnCode, _stdout, _stderr) = execute_ipmi_command(machineName, ['chassis', 'power', 'on'])
bSuccess = (returnCode == 0) # this command can fail if the machine is manually unplugged for example bSuccess = (returnCode == 0) # this command can fail if the machine is manually unplugged for example
return bSuccess return bSuccess
def blockingPutMachineToSleep(machineName): def blocking_put_machine_to_sleep(machineName):
""" """
@return true on success, false otherwise @return true on success, false otherwise
""" """
logInfo('putting machine %s to sleep...' % machineName) log_info('putting machine %s to sleep...' % machineName)
iMaxNumAttempts = 5 iMaxNumAttempts = 5
bSuccess = False bSuccess = False
bBUG_239_IS_STILL_ALIVE = True bBUG_239_IS_STILL_ALIVE = True
@ -135,36 +136,36 @@ def blockingPutMachineToSleep(machineName):
# note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again. # note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
while not bSuccess: while not bSuccess:
# note : pmset must be executed as root # note : pmset must be executed as root
(returnCode, stdout, stderr) = executeProgram(['ssh', machineName, 'pmset sleepnow']) (_returnCode, _stdout, _stderr) = execute_program(['ssh', machineName, 'pmset sleepnow'])
# check if the machine actually went to sleep # check if the machine actually went to sleep
iMaxGoToSleepDuration = 30 # in seconds iMaxGoToSleepDuration = 30 # in seconds
iDelay = 0 iDelay = 0
while iDelay < iMaxGoToSleepDuration: while iDelay < iMaxGoToSleepDuration:
time.sleep(5) time.sleep(5)
iDelay += 5 iDelay += 5
ePowerState = getPowerState(machineName) ePowerState = get_power_state(machineName)
if ePowerState == PowerState.SLEEP: if ePowerState == PowerState.SLEEP:
logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName) log_info('machine %s is now sleeping (put to sleep succeeded)' % machineName)
return True return True
else: else:
if ePowerState != PowerState.ON: if ePowerState != PowerState.ON:
logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState))) log_warning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
assert ePowerState == PowerState.ON assert ePowerState == PowerState.ON
iAttempt += 1 iAttempt += 1
if iAttempt > iMaxNumAttempts: if iAttempt > iMaxNumAttempts:
if bBUG_239_IS_STILL_ALIVE: if bBUG_239_IS_STILL_ALIVE:
logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName)) log_warning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
return False return False
else: else:
logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName)) log_warning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
return False return False
else: else:
logWarning('the attempt to put %s to sleep failed... trying again' % (machineName)) log_warning('the attempt to put %s to sleep failed... trying again' % (machineName))
return True return True
def blockingWakeUpMachine(machineName): def blocking_wake_up_machine(machineName):
logInfo('waking up machine %s...' % machineName) log_info('waking up machine %s...' % machineName)
numAttempts = 0 numAttempts = 0
bWakeUpFailed = True bWakeUpFailed = True
while bWakeUpFailed: # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated while bWakeUpFailed: # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
@ -172,45 +173,45 @@ def blockingWakeUpMachine(machineName):
iNumWakeUpAttempts = 0 iNumWakeUpAttempts = 0
bWakeUpMachineSucceeded = False bWakeUpMachineSucceeded = False
while not bWakeUpMachineSucceeded: while not bWakeUpMachineSucceeded:
bWakeUpMachineSucceeded = wakeUpMachine(machineName) bWakeUpMachineSucceeded = wake_up_machine(machineName)
iNumWakeUpAttempts += 1 iNumWakeUpAttempts += 1
# the previous command can fail if the machine is already in a transition # the previous command can fail if the machine is already in a transition
# in that case we try several times before giving up # in that case we try several times before giving up
if not bWakeUpMachineSucceeded: if not bWakeUpMachineSucceeded:
if iNumWakeUpAttempts < iMaxNumWakeUpAttempts: if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
iDelay = 5 iDelay = 5
logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay)) log_warning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
time.sleep(iDelay) time.sleep(iDelay)
else: else:
logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName)) log_warning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
return False # couldn't wake up the machine for whatever reason return False # couldn't wake up the machine for whatever reason
bWakeUpFailed = False bWakeUpFailed = False
# wait until the machine is operational # wait until the machine is operational
WAKEUPTIMEOUT = 5 * 60 # max number of seconds allowed for a machine to be alive after a wakeup request WAKEUPTIMEOUT = 5 * 60 # max number of seconds allowed for a machine to be alive after a wakeup request
wakeUpToAliveDuration = 0 wakeUpToAliveDuration = 0
while not Lib.SimpaDbUtil.isMachineResponding(machineName): while not is_machine_responding(machineName):
time.sleep(5) time.sleep(5)
wakeUpToAliveDuration += 5 wakeUpToAliveDuration += 5
if wakeUpToAliveDuration > WAKEUPTIMEOUT: if wakeUpToAliveDuration > WAKEUPTIMEOUT:
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?) # the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT)) log_warning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
bWakeUpFailed = True bWakeUpFailed = True
break break
if bWakeUpFailed: if bWakeUpFailed:
numAttempts += 1 numAttempts += 1
if numAttempts >= 2: if numAttempts >= 2:
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName)) log_warning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
return False # power state changed manually ? return False # power state changed manually ?
else: else:
logWarning('attempting to wake up %s one more time' % (machineName)) log_warning('attempting to wake up %s one more time' % (machineName))
else: else:
# wake up completed # wake up completed
logInfo('Waking up of machine %s completed successfully' % machineName) log_info('Waking up of machine %s completed successfully' % machineName)
return True return True
def onException(exception): def on_exception(exception):
sys.stdout.flush() sys.stdout.flush()
strExceptionType = type(exception) strExceptionType = type(exception)
strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message) strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
@ -224,11 +225,10 @@ def onException(exception):
try: try:
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused # I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
# by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that if this operation fails, then the # by a failure of send_text_mail... that's why I've embedded the sendmail inside a try except block, so that if this operation fails, then the
# kill of the main process is still executed. # kill of the main process is still executed.
Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage) send_text_mail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
except BaseException: except BaseException:
logError("Could not send the email to notify the administrator that cluster controller failed") logError("Could not send the email to notify the administrator that cluster controller failed")
pass execute_command('kill -9 %d' % os.getpid()) # stop other threads immediately
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
exit() exit()
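get_power_state above extracts the ACPI state with the regex r'\[(?P<AcpiState>S[0-9][^\:]*)\:'. A standalone sketch of that parsing step on canned output (the sample text is only shaped to match the regex; real ipmitool formatting may differ):

# sketch of the ACPI-state parsing step shown above, run on canned ipmitool output
import re

SAMPLE_OUTPUT = """Sensor ID              : ACPI State (0x25)
 Sensor Type (Discrete): ACPI State
 States Asserted       : ACPI State
                         [S0/G0: working]
"""

def parse_acpi_state(ipmitool_sensor_output):
    """Extract the Sx ACPI state token from `ipmitool ... sensor get 'ACPI State'` output."""
    match = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', ipmitool_sensor_output)
    return match.group('AcpiState') if match else None

print(parse_acpi_state(SAMPLE_OUTPUT))  # -> 'S0/G0'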


@ -1,109 +1,109 @@
#Copyright Jon Berg , turtlemeat.com # Copyright Jon Berg , turtlemeat.com
import cgi
import string,cgi,time import time
from os import curdir, sep from os import curdir, sep
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer # pylint:disable=import-error
import threading import threading
import Util # import pri
#import pri from urlparse import urlparse, parse_qs # pylint:disable=import-error
from urlparse import urlparse, parse_qs
import xml.dom.minidom import xml.dom.minidom
#>>> url = 'http://example.com/?foo=bar&one=1' from .Util import on_exception
#>>> parse_qs(urlparse(url).query) # >>> url = 'http://example.com/?foo=bar&one=1'
#{'foo': ['bar'], 'one': ['1']} # >>> parse_qs(urlparse(url).query)
# {'foo': ['bar'], 'one': ['1']}
class MyHandler(BaseHTTPRequestHandler): class MyHandler(BaseHTTPRequestHandler):
def do_GET(self): def do_GET(self):
try: try:
paramsDict=parse_qs(urlparse(self.path).query) paramsDict = parse_qs(urlparse(self.path).query)
if self.path.endswith(".html"): if self.path.endswith(".html"):
f = open(curdir + sep + self.path) #self.path has /test.html f = open(curdir + sep + self.path, encoding='utf8') # self.path has /test.html
#note that this potentially makes every file on your computer readable by the internet # note that this potentially makes every file on your computer readable by the internet
self.send_response(200) self.send_response(200)
self.send_header('Content-type', 'text/html') self.send_header('Content-type', 'text/html')
self.end_headers() self.end_headers()
self.wfile.write(f.read()) self.wfile.write(f.read())
f.close() f.close()
return return
if self.path.endswith(".esp"): #our dynamic content if self.path.endswith(".esp"): # our dynamic content
self.send_response(200) self.send_response(200)
self.send_header('Content-type', 'text/html') self.send_header('Content-type', 'text/html')
self.end_headers() self.end_headers()
self.wfile.write("hey, today is the" + str(time.localtime()[7])) self.wfile.write("hey, today is the" + str(time.localtime()[7]))
self.wfile.write(" day in the year " + str(time.localtime()[0])) self.wfile.write(" day in the year " + str(time.localtime()[0]))
return return
if self.path.endswith("ShowControlledMachines"): #http://simpatix10.univ-rennes1.fr:8080/ShowControlledMachines if self.path.endswith("ShowControlledMachines"): # http://simpatix10.univ-rennes1.fr:8080/ShowControlledMachines
self.send_response(200) self.send_response(200)
self.send_header('Content-type', 'text/xml') self.send_header('Content-type', 'text/xml')
self.end_headers() self.end_headers()
# Create the minidom document # Create the minidom document
doc = xml.dom.minidom.Document() doc = xml.dom.minidom.Document()
# Create the <ControlledMachines> root element # Create the <ControlledMachines> root element
controlledMachinesElement = doc.createElement("ControlledMachines") controlledMachinesElement = doc.createElement("ControlledMachines")
doc.appendChild(controlledMachinesElement) doc.appendChild(controlledMachinesElement)
for machine in self.server.m_clusterController.m_clusterStatus.m_clusterNodes.values(): for machine in self.server.cluster_controller.cluster_status.cluster_nodes.values():
# Create the main <card> element # Create the main <card> element
controlledMachineElement = doc.createElement("Machine") controlledMachineElement = doc.createElement("Machine")
controlledMachineElement.setAttribute("name", machine.getName()) controlledMachineElement.setAttribute("name", machine.get_name())
controlledMachinesElement.appendChild(controlledMachineElement) controlledMachinesElement.appendChild(controlledMachineElement)
# Print our newly created XML # Print our newly created XML
self.wfile.write(doc.toprettyxml(indent=" ")) self.wfile.write(doc.toprettyxml(indent=" "))
return return
if urlparse(self.path).path == '/SetControlOnMachine': #http://simpatix10.univ-rennes1.fr:8080/SetControlOnMachine?machineName=simpatix30&control=1 if urlparse(self.path).path == '/SetControlOnMachine': # http://simpatix10.univ-rennes1.fr:8080/SetControlOnMachine?machineName=simpatix30&control=1
machineName = paramsDict['machineName'][0] machineName = paramsDict['machineName'][0]
bControl = (paramsDict['control'][0] == '1') bControl = (paramsDict['control'][0] == '1')
self.server.m_clusterController.setControlOnMachine(machineName, bControl) self.server.cluster_controller.set_control_on_machine(machineName, bControl)
self.send_response(200) self.send_response(200)
self.send_header('Content-type', 'text/html') self.send_header('Content-type', 'text/html')
self.end_headers() self.end_headers()
if bControl == True: if bControl is True:
self.wfile.write("%s is now controlled by ClusterController" % machineName) self.wfile.write("%s is now controlled by ClusterController" % machineName)
else: else:
self.wfile.write("%s is no longer controlled by ClusterController" % machineName) self.wfile.write("%s is no longer controlled by ClusterController" % machineName)
return return
return return
except IOError: except IOError:
self.send_error(404,'File Not Found: %s' % self.path) self.send_error(404, 'File Not Found: %s' % self.path)
def do_POST(self): def do_POST(self):
global rootnode
try: try:
ctype, pdict = cgi.parse_header(self.headers.getheader('content-type')) ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
if ctype == 'multipart/form-data': if ctype == 'multipart/form-data':
query=cgi.parse_multipart(self.rfile, pdict) query = cgi.parse_multipart(self.rfile, pdict)
self.send_response(301) self.send_response(301)
self.end_headers() self.end_headers()
upfilecontent = query.get('upfile') upfilecontent = query.get('upfile')
print "filecontent", upfilecontent[0] print("filecontent", upfilecontent[0])
self.wfile.write("<HTML>POST OK.<BR><BR>"); self.wfile.write("<HTML>POST OK.<BR><BR>")
self.wfile.write(upfilecontent[0]); self.wfile.write(upfilecontent[0])
except : except BaseException:
pass pass
class WebServerThread( threading.Thread ):
def __init__( self, clusterController ):
threading.Thread.__init__(self)
#self.m_clusterController = clusterController
self.m_bStop = False
self.m_httpServer = HTTPServer(('', 8080), MyHandler)
self.m_httpServer.m_clusterController = clusterController
def run( self ):
try:
while not self.m_bStop:
self.m_httpServer.handle_request()
#self.m_httpServer.serve_forever()
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
self.m_httpServer.socket.close()
Util.onException(exception)
class WebServerThread(threading.Thread):
stop: bool
http_server: HTTPServer
def __init__(self, clusterController):
threading.Thread.__init__(self)
# self.cluster_controller = clusterController
self.stop = False
self.http_server = HTTPServer(('', 8080), MyHandler)
self.http_server.cluster_controller = clusterController
def run(self):
try:
while not self.stop:
self.http_server.handle_request()
# self.http_server.serve_forever()
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
self.http_server.socket.close()
on_exception(exception)
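The rewritten WebServerThread polls handle_request() until its stop flag is set. A simplified Python 3 analogue (the diff above still imports the Python 2 BaseHTTPServer module), with a request timeout so the stop flag is actually honoured between requests:

# simplified, standalone Python 3 analogue of the WebServerThread pattern above
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer

class HelloHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/plain')
        self.end_headers()
        self.wfile.write(b'hello\n')

class ServerThread(threading.Thread):
    def __init__(self, port=8080):
        threading.Thread.__init__(self)
        self.stop = False
        self.http_server = HTTPServer(('', port), HelloHandler)
        self.http_server.timeout = 1  # handle_request() returns regularly so `stop` gets checked

    def run(self):
        while not self.stop:
            self.http_server.handle_request()
        self.http_server.server_close()

# usage:
#   t = ServerThread(); t.start(); ...; t.stop = True; t.join()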


@ -12,35 +12,35 @@ class ClusterNodeSensorsReadings:
POWERSTATE_SLEEP=3 POWERSTATE_SLEEP=3
""" """
def __init__(self, clusterNodeName): def __init__(self, clusterNodeName):
self.m_clusterNodeName = clusterNodeName self.cluster_node_name = clusterNodeName
self.m_sensors = {} self.sensors = {}
# self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN # self.power_state = ClusterNodeStatus.POWERSTATE_UNKNOWN
return return
def addSensor(self, sensor): def addSensor(self, sensor):
self.m_sensors[sensor.m_name] = sensor self.sensors[sensor.name] = sensor
def dump(self): def dump(self):
for key, sensor in self.m_sensors.items(): for key, sensor in self.sensors.items():
sensor.dump() sensor.dump()
return return
# def getPowerState(self): # def get_power_state(self):
# return self.m_powerState # return self.power_state
def getLowestTemperature(self): def getLowestTemperature(self):
# log('ClusterNodeSensorsReadings::getLowestTemperature : start') # log('ClusterNodeSensorsReadings::getLowestTemperature : start')
lowestTemperature = 0.0 lowestTemperature = 0.0
lowestTemperatureIsDefined = False lowestTemperatureIsDefined = False
for key, sensor in self.m_sensors.items(): for key, sensor in self.sensors.items():
# log('ClusterNodeSensorsReadings::getLowestTemperature : start') # log('ClusterNodeSensorsReadings::getLowestTemperature : start')
if sensor.typeName() == 'Temperature': if sensor.typeName() == 'Temperature':
sensor.m_temperature sensor.temperature
if lowestTemperatureIsDefined: if lowestTemperatureIsDefined:
if sensor.m_temperature < lowestTemperature: if sensor.temperature < lowestTemperature:
lowestTemperature = sensor.m_temperature lowestTemperature = sensor.temperature
else: else:
lowestTemperature = sensor.m_temperature lowestTemperature = sensor.temperature
lowestTemperatureIsDefined = True lowestTemperatureIsDefined = True
assert lowestTemperatureIsDefined assert lowestTemperatureIsDefined
# log('ClusterNodeSensorsReadings::getLowestTemperature : end') # log('ClusterNodeSensorsReadings::getLowestTemperature : end')
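The lowest-temperature scan above can also be written as a single min() over the temperature readings; a standalone sketch, with a plain dict standing in for the Sensor objects:

# compact equivalent of the lowest-temperature scan above; `readings` maps sensor
# name -> (type name, value), a simplified stand-in for the Sensor objects
def lowest_temperature(readings):
    temperatures = [value for kind, value in readings.values() if kind == 'Temperature']
    assert temperatures, 'expected at least one temperature sensor'
    return min(temperatures)

print(lowest_temperature({'CPU A Temp': ('Temperature', 42.5),
                          'CPU B Temp': ('Temperature', 39.0),
                          'FAN1': ('Fan', 1800.0)}))  # -> 39.0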


@ -31,12 +31,12 @@ class IpmiTool202Parser:
rpms = self.parseFanSensorOutput(f) rpms = self.parseFanSensorOutput(f)
if temperature is not None: if temperature is not None:
sensor = FanSensor(sensorName) sensor = FanSensor(sensorName)
sensor.m_rpms = rpms sensor.rpms = rpms
elif sensorType == 'Temperature': elif sensorType == 'Temperature':
temperature = self.parseTemperatureSensorOutput(f) temperature = self.parseTemperatureSensorOutput(f)
if temperature is not None: if temperature is not None:
sensor = TemperatureSensor(sensorName) sensor = TemperatureSensor(sensorName)
sensor.m_temperature = temperature sensor.temperature = temperature
else: else:
# ignoring other sensors # ignoring other sensors
sensor = None sensor = None


@ -22,10 +22,10 @@ class IpmiTool218Parser:
sensor = None sensor = None
if sensorUnit == 'degrees C': if sensorUnit == 'degrees C':
sensor = TemperatureSensor(sensorName) sensor = TemperatureSensor(sensorName)
sensor.m_temperature = float(sensorValue) sensor.temperature = float(sensorValue)
elif sensorUnit == 'RPM': elif sensorUnit == 'RPM':
sensor = FanSensor(sensorName) sensor = FanSensor(sensorName)
sensor.m_rpms = float(sensorValue) sensor.rpms = float(sensorValue)
else: else:
None None
if sensor: if sensor:


@ -1,23 +1,40 @@
from typing import Optional
class Sensor: class Sensor:
def __init__(self, sensorName): def __init__(self, sensorName):
self.m_name = sensorName self.name = sensorName
self.m_isValid = True # false if this sensor is not actually present on the target machine self.is_valid = True # false if this sensor is not actually present on the target machine
return return
def dump(self): def dump(self):
print self.m_name print(self.name)
class FanSensor(Sensor): class FanSensor(Sensor):
rpms: Optional[float]
def __init__(self, sensorName): def __init__(self, sensorName):
Sensor.__init__(self, sensorName) Sensor.__init__(self, sensorName)
self.rpms = None
def dump(self): def dump(self):
print 'Fan \'', self.m_name, '\' rpm=',self.m_rpms print('Fan \'', self.name, '\' rpm=', self.rpms)
def typeName(self): def typeName(self):
return 'Fan' return 'Fan'
class TemperatureSensor(Sensor): class TemperatureSensor(Sensor):
temperature: Optional[float]
def __init__(self, sensorName): def __init__(self, sensorName):
Sensor.__init__(self, sensorName) Sensor.__init__(self, sensorName)
self.temperature = None
def dump(self): def dump(self):
print 'Temperature \'', self.m_name, '\' temperature=',self.m_temperature print('Temperature \'', self.name, '\' temperature=', self.temperature)
def typeName(self): def typeName(self):
return 'Temperature' return 'Temperature'
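The Optional-annotated sensor classes above could also be expressed as dataclasses, which provide the annotated fields and a readable repr for free; a sketch of that alternative, not part of the commit:

# dataclass-based sketch of the sensor hierarchy above (illustrative only)
from dataclasses import dataclass
from typing import Optional

@dataclass
class Sensor:
    name: str
    is_valid: bool = True  # False if this sensor is not actually present on the target machine

@dataclass
class FanSensor(Sensor):
    rpms: Optional[float] = None

@dataclass
class TemperatureSensor(Sensor):
    temperature: Optional[float] = None

print(TemperatureSensor('CPU A Temp', temperature=42.5))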


@ -8,31 +8,31 @@ else:
import re import re
from .wol import wake_on_lan from .wol import wake_on_lan
import os import os
from .Util import executeProgram, executeCommand, log from .Util import execute_program, execute_command, log
import abc import abc
import sqlite3 import sqlite3
from .mysql2sqlite import mysql_to_sqlite from .mysql2sqlite import mysql_to_sqlite
def isMachineResponding(machineName): def is_machine_responding(machineName):
(returnCode, stdout, stderr) = executeProgram(['ping', '-o', '-t', '1', machineName]) (returnCode, stdout, stderr) = execute_program(['ping', '-o', '-t', '1', machineName])
# log( 'isMachineResponding : result of command %s : %d' % (command, returnCode) ) # log( 'is_machine_responding : result of command %s : %d' % (command, returnCode) )
if returnCode == 0: if returnCode == 0:
return True return True
else: else:
bMachineNameIsNotKnown = (returnCode == 68) # bMachineNameIsNotKnown = (returnCode == 68)
bMachineIsNotResponding = (returnCode == 2) bMachineIsNotResponding = (returnCode == 2)
if bMachineIsNotResponding is False: if bMachineIsNotResponding is False:
bBUG_00000004_IS_STILL_ALIVE = True bBUG_00000004_IS_STILL_ALIVE = True
if bBUG_00000004_IS_STILL_ALIVE is True and returnCode == 142: if bBUG_00000004_IS_STILL_ALIVE is True and returnCode == 142:
log('isMachineResponding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName)) log('is_machine_responding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
# don't stop the program until we understand bug00000004 # don't stop the program until we understand bug00000004
elif bBUG_00000004_IS_STILL_ALIVE is True and returnCode == -14: # I had this error code on 07/09/2009 20:38 but I don't know yet what that means elif bBUG_00000004_IS_STILL_ALIVE is True and returnCode == -14: # I had this error code on 07/09/2009 20:38 but I don't know yet what that means
log('isMachineResponding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName)) log('is_machine_responding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
# don't stop the program until we understand bug00000004 # don't stop the program until we understand bug00000004
else: else:
log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName)) log('is_machine_responding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
assert False assert False
return False return False
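A standalone sketch of the ping-based liveness check above; note that the '-o' and '-t' flags used in the diff are the macOS ones, a Linux ping would typically use '-c 1 -W 1' instead:

# illustrative liveness check built on subprocess; flag choice depends on the platform
import subprocess

def is_responding(machine_name, ping_args=('-o', '-t', '1')):
    """Return True if the machine answers a single ping within the timeout."""
    completed = subprocess.run(['ping', *ping_args, machine_name],
                               capture_output=True, text=True)
    return completed.returncode == 0

# example: is_responding('simpatix12')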
@ -46,7 +46,6 @@ class ISqlDatabaseBackend(object):
""" """
:param str sql_query: the sql query to perform :param str sql_query: the sql query to perform
""" """
pass
class RemoteMysqlDb(ISqlDatabaseBackend): class RemoteMysqlDb(ISqlDatabaseBackend):
@ -70,7 +69,7 @@ class RemoteMysqlDb(ISqlDatabaseBackend):
:param str sql_query: the sql query to perform :param str sql_query: the sql query to perform
""" """
self._conn.query(sql_query) self._conn.query(sql_query)
rows = conn.store_result() rows = self._conn.store_result()
return rows return rows
@ -87,7 +86,7 @@ class SqlFile(ISqlDatabaseBackend):
# - the file is stored on a solid state disk # - the file is stored on a solid state disk
try: try:
os.remove(sqlite_db_path) os.remove(sqlite_db_path)
except: except BaseException:
pass pass
check_same_thread = False check_same_thread = False
# this is to prevent the following error when run from apache/django : SQLite objects created in a thread can only be used in that same thread.The object was created in thread id 139672342353664 and this is thread id 139672333960960 # this is to prevent the following error when run from apache/django : SQLite objects created in a thread can only be used in that same thread.The object was created in thread id 139672342353664 and this is thread id 139672333960960
@ -95,7 +94,7 @@ class SqlFile(ISqlDatabaseBackend):
# If set False, the returned connection may be shared across multiple threads. When using multiple threads with the same connection writing operations should be serialized by the user to avoid data corruption # If set False, the returned connection may be shared across multiple threads. When using multiple threads with the same connection writing operations should be serialized by the user to avoid data corruption
# I hope it's safe here but I'm not 100% sure though. Anyway, if the database gets corrupt, it's not a big deal since this memory resident database gets reconstructed from the sql file... # I hope it's safe here but I'm not 100% sure though. Anyway, if the database gets corrupt, it's not a big deal since this memory resident database gets reconstructed from the sql file...
self._con = sqlite3.connect(sqlite_db_path, check_same_thread=check_same_thread) self._con = sqlite3.connect(sqlite_db_path, check_same_thread=check_same_thread)
with open(str(self._sql_file_path), 'r') as f: # str conversion has been added to support older versions of python in which open don't accept arguments of type Path with open(str(self._sql_file_path), 'r', encoding='utf8') as f: # str conversion has been added to support older versions of python in which open don't accept arguments of type Path
sql = f.read() # watch out for built-in `str` sql = f.read() # watch out for built-in `str`
# print(sql) # print(sql)
self._cur = self._con.cursor() self._cur = self._con.cursor()
@ -111,7 +110,6 @@ class SqlFile(ISqlDatabaseBackend):
""" """
:param str sql_query: the sql query to perform :param str sql_query: the sql query to perform
""" """
pass
self._cur.execute(sql_query) self._cur.execute(sql_query)
rows = self._cur.fetchall() rows = self._cur.fetchall()
return rows return rows
@ -196,7 +194,7 @@ def getLightOutManagementIpAddress(machineName):
return ipAddress return ipAddress
def getClusterMachinesNames(): def get_cluster_machines_names():
clusterMachinesNames = [] clusterMachinesNames = []
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb') conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert conn assert conn
@ -223,14 +221,12 @@ def machineSupportsIpmi(machineName):
def putToSleep(machineName): def putToSleep(machineName):
# note : pmset must be executed as root # note : pmset must be executed as root
(returnCode, stdout, stderr) = executeCommand(['ssh', machineName, 'pmset sleepnow']) (returnCode, stdout, _stderr) = execute_command(['ssh', machineName, 'pmset sleepnow'])
""" # print returnCode
print returnCode # print 'stdout :'
print 'stdout :' # print stdout
print stdout # print 'stderr :'
print 'stderr :' # print stderr
print stderr
"""
assert returnCode == 0 assert returnCode == 0
# check if the command succeeded by looking at the output (that's the only way I found) # check if the command succeeded by looking at the output (that's the only way I found)
f = StringIO.StringIO(stdout) f = StringIO.StringIO(stdout)
@ -255,7 +251,7 @@ def isNonRespondingMachineSleeping(machineName):
""" """
wakeUp(machineName) wakeUp(machineName)
time.sleep(120) time.sleep(120)
if isMachineResponding(machineName): if is_machine_responding(machineName):
putToSleep(machineName) putToSleep(machineName)
time.sleep(30) # allow a little time to make sure the machine is ready to receive other wake on lan messages time.sleep(30) # allow a little time to make sure the machine is ready to receive other wake on lan messages
return True return True
@ -264,11 +260,9 @@ def isNonRespondingMachineSleeping(machineName):
if __name__ == '__main__': if __name__ == '__main__':
""" # for i in range(30):
for i in range(30): # machineName = 'simpatix%d' % (i+10)
machineName = 'simpatix%d' % (i+10) # print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
"""
wakeUp('simpatix21') wakeUp('simpatix21')
# print putToSleep('simpatix13') # print putToSleep('simpatix13')
# print isNonRespondingMachineSleeping('simpatix13') # print isNonRespondingMachineSleeping('simpatix13')
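The SqlFile backend above loads a SQL dump into sqlite with check_same_thread=False so the connection can be shared across threads. A self-contained sketch of the same pattern with an in-memory database and an inline dump string:

# standalone sketch of the SQL-file-backed sqlite pattern above, using an
# in-memory database and a literal sql string instead of a dump file
import sqlite3

SQL_DUMP = """
CREATE TABLE machines (name TEXT, lom_ip TEXT);
INSERT INTO machines VALUES ('simpatix12', '129.20.27.212');
"""

# check_same_thread=False lets other threads reuse the connection; writes must
# then be serialized by the caller, as the comment in the diff above points out
con = sqlite3.connect(':memory:', check_same_thread=False)
con.executescript(SQL_DUMP)
cur = con.cursor()
cur.execute("SELECT lom_ip FROM machines WHERE name = ?", ('simpatix12',))
print(cur.fetchall())  # -> [('129.20.27.212',)]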


@ -1,6 +1,5 @@
import time import time
import subprocess import subprocess
import io
import re import re
import logging import logging
# from wol import * # from wol import *
@ -16,7 +15,7 @@ else:
from email.mime.text import MIMEText from email.mime.text import MIMEText
def sendTextMail(strFrom, to, strSubject, text): def send_text_mail(strFrom, to, strSubject, text):
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>" # from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
mail = MIMEText(text) mail = MIMEText(text)
mail['From'] = strFrom mail['From'] = strFrom
@ -30,12 +29,14 @@ def sendTextMail(strFrom, to, strSubject, text):
class Error(Exception): class Error(Exception):
message: str
def __init__(self, strMessage): def __init__(self, strMessage):
self.m_strMessage = strMessage self.message = strMessage
def getHostName(): def getHostName():
(returnCode, stdout, stderr) = executeProgram(['hostname', '-s']) (returnCode, stdout, stderr) = execute_program(['hostname', '-s'])
if returnCode != 0: if returnCode != 0:
raise Error(stderr) raise Error(stderr)
strHostName = re.sub(r"\n", "", stdout) strHostName = re.sub(r"\n", "", stdout)
@ -46,18 +47,18 @@ def log(message):
print(time.asctime(time.localtime()) + ' : ' + message) print(time.asctime(time.localtime()) + ' : ' + message)
def executeProgram(astrArguments): def execute_program(astrArguments):
# log('executeProgram : program [%s]' % (','.join(astrArguments))) # log('execute_program : program [%s]' % (','.join(astrArguments)))
popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
stdout, stderr = popen.communicate() stdout, stderr = popen.communicate()
# popen.wait() # popen.wait()
result = (popen.returncode, stdout.decode(), stderr) result = (popen.returncode, stdout.decode(), stderr)
# log('executeProgram : command %s popen.pid = %d' % (astrArguments[0], popen.pid)) # log('execute_program : command %s popen.pid = %d' % (astrArguments[0], popen.pid))
# os.kill(popen.pid, signal.SIGTERM) # os.kill(popen.pid, signal.SIGTERM)
return result return result
def executeCommand(command): def execute_command(command):
""" """
executes the shell command such as 'set x=1; myprog $x' executes the shell command such as 'set x=1; myprog $x'
""" """
@ -69,25 +70,25 @@ def executeCommand(command):
return result return result
def executeCommandOn(target_machine_fqdn: str, command: str, user: str = None): def execute_commandOn(target_machine_fqdn: str, command: str, user: str = None):
""" """
execute command on a local or remote machine (using ssh then) execute command on a local or remote machine (using ssh then)
:param str user: if not None, the user that should be used to execute the command (instead of the current user) :param str user: if not None, the user that should be used to execute the command (instead of the current user)
""" """
logging.debug("executing %s on %s as %s" % (command, target_machine_fqdn, user)) logging.debug("executing %s on %s as %s", command, target_machine_fqdn, user)
if getHostName() == target_machine_fqdn.split('.')[0]: if getHostName() == target_machine_fqdn.split('.')[0]:
if user is not None: if user is not None:
# su -c "ls -l /tmp" graffy # su -c "ls -l /tmp" graffy
result = executeCommand("su -c '%s' %s" % (command, user)) result = execute_command("su -c '%s' %s" % (command, user))
else: else:
result = executeCommand(command) result = execute_command(command)
else: else:
if user is not None: if user is not None:
target = '%s@%s' % (user, target_machine_fqdn) target = '%s@%s' % (user, target_machine_fqdn)
else: else:
target = target_machine_fqdn target = target_machine_fqdn
result = executeProgram(['ssh', target, "%s" % command]) result = execute_program(['ssh', target, "%s" % command])
logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user)) logging.debug("finished executing %s on %s as %s", command, target_machine_fqdn, user)
return result return result
@ -97,16 +98,16 @@ def getUpsStatus():
def __init__(self): def __init__(self):
HTMLParser.__init__(self) HTMLParser.__init__(self)
self.TokenList = [] self.token_list = []
def handle_data(self, data): def handle_data(self, data):
data = data.strip() data = data.strip()
if data and len(data) > 0: if data and len(data) > 0:
self.TokenList.append(data) self.token_list.append(data)
# print data # print data
def GetTokenList(self): def get_token_list(self):
return self.TokenList return self.token_list
from urllib.request import urlopen from urllib.request import urlopen
try: try:
@ -114,21 +115,20 @@ def getUpsStatus():
f = urlopen(url) f = urlopen(url)
res = f.read() res = f.read()
f.close() f.close()
except: except BaseException:
print("bad read") print("bad read")
return return
h = MyHTMLParser() h = MyHTMLParser()
h.feed(res) h.feed(res)
tokensList = h.GetTokenList() # noqa:F841 _tokensList = h.get_token_list() # noqa:F841
raise NotImplementedError('the implementation is not complete')
if __name__ == '__main__': if __name__ == '__main__':
from SimpaDbUtil import wakeUp from .SimpaDbUtil import wakeUp
""" # for i in range(30):
for i in range(30): # machineName = 'simpatix%d' % (i+10)
machineName = 'simpatix%d' % (i+10) # print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
"""
wakeUp('simpatix21') wakeUp('simpatix21')
# print putToSleep('simpatix13') # print putToSleep('simpatix13')
# print isNonRespondingMachineSleeping('simpatix13') # print isNonRespondingMachineSleeping('simpatix13')
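execute_commandOn above runs a command locally when the target is the current host and falls back to ssh otherwise. A standalone sketch of that dispatch built on subprocess.run (the hostname comparison is simplified to the first label of the FQDN, as in the diff):

# illustrative local-vs-ssh dispatch; not the project's execute_commandOn itself
import socket
import subprocess

def run_on(target_fqdn, command, user=None):
    """Run a shell command locally when target_fqdn is this host, over ssh otherwise."""
    if socket.gethostname().split('.')[0] == target_fqdn.split('.')[0]:
        if user is not None:
            command = "su -c '%s' %s" % (command, user)
        completed = subprocess.run(command, shell=True, capture_output=True, text=True)
    else:
        target = target_fqdn if user is None else '%s@%s' % (user, target_fqdn)
        completed = subprocess.run(['ssh', target, command], capture_output=True, text=True)
    return completed.returncode, completed.stdout, completed.stderr

# example: run_on('simpatix10.univ-rennes1.fr', 'uptime')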


@ -20,7 +20,7 @@ class CoclutoTestCase(unittest.TestCase):
qstat_output = file.read() qstat_output = file.read()
# qstatParser = ClusterController.QstatParser() # qstatParser = ClusterController.QstatParser()
qstatParser = QstatParser() qstatParser = QstatParser()
job_state = qstatParser.parseQstatOutput(qstat_output, cluster_domain='ipr.univ-rennes1.fr') job_state = qstatParser.parse_qstat_output(qstat_output, cluster_domain='ipr.univ-rennes1.fr')
self.assertIsInstance(job_state, JobsState) self.assertIsInstance(job_state, JobsState)