fixed styling errors and added more type hinting to increase maintainability of cocluto
- all styling errors are now fixed, but there are still warnings and information
- most functions have been converted to snake case
- most functions now have type hinting

work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3873]
This commit is contained in:
parent
6bf69f909b
commit
9f4a80b11e
|
@ -1,21 +1,26 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
from typing import Dict, Optional
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0, '..')
|
|
||||||
import os
|
import os
|
||||||
import MySQLdb
|
import MySQLdb
|
||||||
import threading
|
import threading
|
||||||
from Lib.Util import *
|
|
||||||
from Lib.SimpaDbUtil import *
|
|
||||||
import time
|
import time
|
||||||
from ClusterStatus import ClusterStatus
|
from datetime import datetime
|
||||||
from SlotAllocator import DecoupledSlotAllocator
|
if sys.version_info < (3, 0):
|
||||||
from Log import logDebug, logInfo
|
from HTMLParser import HTMLParser
|
||||||
from ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier
|
else:
|
||||||
from SunGridEngine import SunGridEngine
|
from html.parser import HTMLParser
|
||||||
from Util import log, onException
|
from ..Util import log
|
||||||
from WebServer import WebServerThread
|
# from ..SimpaDbUtil import toto
|
||||||
from PowerState import PowerState
|
from .ClusterNode import ClusterNodeId, ClusterNode
|
||||||
from HTMLParser import HTMLParser
|
from .ClusterStatus import ClusterStatus
|
||||||
|
from .SlotAllocator import DecoupledSlotAllocator, SlotAllocator
|
||||||
|
from .Log import logDebug, log_info
|
||||||
|
from .ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier
|
||||||
|
from .SunGridEngine import SunGridEngine
|
||||||
|
from .Util import on_exception
|
||||||
|
from .WebServer import WebServerThread
|
||||||
|
from .PowerState import PowerState
|
||||||
|
|
||||||
VERSION = '1.18'
|
VERSION = '1.18'
|
||||||
|
|
||||||
|
@ -23,38 +28,38 @@ VERSION = '1.18'
|
||||||
class MyHTMLParser(HTMLParser):
|
class MyHTMLParser(HTMLParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
self.TokenList = []
|
self.token_list = []
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
data = data.strip()
|
data = data.strip()
|
||||||
if data and len(data) > 0:
|
if data and len(data) > 0:
|
||||||
self.TokenList.append(data)
|
self.token_list.append(data)
|
||||||
# print data
|
# print data
|
||||||
|
|
||||||
def GetTokenList(self):
|
def get_token_list(self):
|
||||||
return self.TokenList
|
return self.token_list
|
||||||
|
|
||||||
|
|
||||||
class WakeUpCompleteNotifier(IWakeUpCompleteNotifier):
|
class WakeUpCompleteNotifier(IWakeUpCompleteNotifier):
|
||||||
|
|
||||||
def __init__(self, machineName, clusterController):
|
def __init__(self, machineName, clusterController):
|
||||||
self.m_machineName = machineName
|
self.machine_name = machineName
|
||||||
self.m_clusterController = clusterController
|
self.cluster_controller = clusterController
|
||||||
|
|
||||||
def onWakeUpComplete(self):
|
def on_wake_up_complete(self):
|
||||||
logDebug('WakeUpCompleteNotifier::onWakeUpComplete : start')
|
logDebug('WakeUpCompleteNotifier::on_wake_up_complete : start')
|
||||||
self.m_clusterController.onMachineWakeUpComplete(self.m_machineName)
|
self.cluster_controller.onMachineWakeUpComplete(self.machine_name)
|
||||||
|
|
||||||
|
|
||||||
class SleepCompleteNotifier(ISleepCompleteNotifier):
|
class SleepCompleteNotifier(ISleepCompleteNotifier):
|
||||||
|
|
||||||
def __init__(self, machineName, clusterController):
|
def __init__(self, machineName, clusterController):
|
||||||
self.m_machineName = machineName
|
self.machine_name = machineName
|
||||||
self.m_clusterController = clusterController
|
self.cluster_controller = clusterController
|
||||||
|
|
||||||
def onSleepComplete(self, bSleepSucceeded):
|
def on_sleep_complete(self, bSleepSucceeded):
|
||||||
logDebug('SleepCompleteNotifier::onSleepComplete : start')
|
logDebug('SleepCompleteNotifier::on_sleep_complete : start')
|
||||||
self.m_clusterController.onMachineSleepComplete(self.m_machineName, bSleepSucceeded)
|
self.cluster_controller.onMachineSleepComplete(self.machine_name, bSleepSucceeded)
|
||||||
|
|
||||||
|
|
||||||
def jouleToKwh(fEnergyInJoules):
|
def jouleToKwh(fEnergyInJoules):
|
||||||
|
@ -79,23 +84,36 @@ class ClusterController:
|
||||||
jobs (eg add some machines to a queue).
|
jobs (eg add some machines to a queue).
|
||||||
Mechanism to let user get priority
|
Mechanism to let user get priority
|
||||||
"""
|
"""
|
||||||
|
cluster_status: ClusterStatus
|
||||||
|
slot_allocator = SlotAllocator
|
||||||
|
machines_that_need_wake_up: Dict[ClusterNodeId, ClusterNode]
|
||||||
|
machines_that_need_wake_up_lock: threading.Lock # to prevent concurrent access to machines_that_need_wake_up
|
||||||
|
machines_that_need_sleeping: Dict[ClusterNodeId, ClusterNode]
|
||||||
|
machines_that_need_sleeping_lock: threading.Lock # to prevent concurrent access to machines_that_need_sleeping
|
||||||
|
last_energy_status_log_time: Optional[datetime]
|
||||||
|
DELAY_BETWEEN_ENERGY_STATUS_LOGS: int # in seconds
|
||||||
|
session_id: Optional[int] # session (run) identifier in database
|
||||||
|
web_server: WebServerThread
|
||||||
|
stop: bool
|
||||||
|
stop_lock: threading.Lock # to prevent concurrent access to stop
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
gridEngine = SunGridEngine()
|
gridEngine = SunGridEngine()
|
||||||
self.m_clusterStatus = ClusterStatus(gridEngine)
|
self.cluster_status = ClusterStatus(gridEngine)
|
||||||
self.m_slotAllocator = DecoupledSlotAllocator() # SimpleSlotAllocator()
|
self.slot_allocator = DecoupledSlotAllocator() # SimpleSlotAllocator() pylint: disable=no-value-for-parameter
|
||||||
self.m_machinesThatNeedWakeUp = {}
|
self.machines_that_need_wake_up = {}
|
||||||
self.m_machinesThatNeedWakeupLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedWakeUp
|
self.machines_that_need_wake_up_lock = threading.Lock()
|
||||||
self.m_machinesThatNeedSleeping = {}
|
self.machines_that_need_sleeping = {}
|
||||||
self.m_machinesThatNeedSleepingLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedSleeping
|
self.machines_that_need_sleeping_lock = threading.Lock()
|
||||||
self.m_lastEnergyStatusLogTime = None
|
self.last_energy_status_log_time = None
|
||||||
self.DELAY_BETWEEN_ENERGY_STATUS_LOGS = 60 # in seconds
|
self.DELAY_BETWEEN_ENERGY_STATUS_LOGS = 60
|
||||||
self.m_iSessionId = None # session (run) identifier in database
|
self.session_id = None
|
||||||
self.m_webServer = WebServerThread(self)
|
self.web_server = WebServerThread(self)
|
||||||
self.m_bStop = False
|
self.stop = False
|
||||||
self.m_bStopLock = threading.Lock() # to prevent concurrent access to m_bStop
|
self.stop_lock = threading.Lock()
|
||||||
|
|
||||||
def getClusterStatus(self):
|
def getClusterStatus(self):
|
||||||
return self.m_clusterStatus
|
return self.cluster_status
|
||||||
|
|
||||||
def log(self, message):
|
def log(self, message):
|
||||||
print(message)
|
print(message)
|
||||||
|
@ -104,79 +122,76 @@ class ClusterController:
|
||||||
self.log("ClusterController::shutdownLeastImportantNode : start")
|
self.log("ClusterController::shutdownLeastImportantNode : start")
|
||||||
|
|
||||||
def onMachineWakeUpComplete(self, machineName):
|
def onMachineWakeUpComplete(self, machineName):
|
||||||
self.m_machinesThatNeedWakeupLock.acquire()
|
self.machines_that_need_wake_up_lock.acquire()
|
||||||
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
|
||||||
del self.m_machinesThatNeedWakeUp[machineName]
|
del self.machines_that_need_wake_up[machineName]
|
||||||
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
|
||||||
self.m_machinesThatNeedWakeupLock.release()
|
self.machines_that_need_wake_up_lock.release()
|
||||||
logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)
|
logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)
|
||||||
|
|
||||||
def onMachineSleepComplete(self, machineName, bSleepSucceeded):
|
def onMachineSleepComplete(self, machineName, bSleepSucceeded):
|
||||||
self.m_machinesThatNeedSleepingLock.acquire()
|
self.machines_that_need_sleeping_lock.acquire()
|
||||||
# logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
# logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
|
||||||
del self.m_machinesThatNeedSleeping[machineName]
|
del self.machines_that_need_sleeping[machineName]
|
||||||
# logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
# logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.machines_that_need_wake_up) = %d' % (machineName,len(self.machines_that_need_wake_up)))
|
||||||
self.m_machinesThatNeedSleepingLock.release()
|
self.machines_that_need_sleeping_lock.release()
|
||||||
if bSleepSucceeded:
|
if bSleepSucceeded:
|
||||||
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
|
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
|
||||||
else:
|
else:
|
||||||
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)
|
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)
|
||||||
|
|
||||||
def getNumPendingWakeUps(self):
|
def getNumPendingWakeUps(self):
|
||||||
self.m_machinesThatNeedWakeupLock.acquire()
|
self.machines_that_need_wake_up_lock.acquire()
|
||||||
numPendingWakeUps = len(self.m_machinesThatNeedWakeUp)
|
numPendingWakeUps = len(self.machines_that_need_wake_up)
|
||||||
self.m_machinesThatNeedWakeupLock.release()
|
self.machines_that_need_wake_up_lock.release()
|
||||||
return numPendingWakeUps
|
return numPendingWakeUps
|
||||||
|
|
||||||
def getNumPendingSleeps(self):
|
def getNumPendingSleeps(self):
|
||||||
self.m_machinesThatNeedSleepingLock.acquire()
|
self.machines_that_need_sleeping_lock.acquire()
|
||||||
numPendingSleeps = len(self.m_machinesThatNeedSleeping)
|
numPendingSleeps = len(self.machines_that_need_sleeping)
|
||||||
self.m_machinesThatNeedSleepingLock.release()
|
self.machines_that_need_sleeping_lock.release()
|
||||||
return numPendingSleeps
|
return numPendingSleeps
|
||||||
|
|
||||||
def putIdleMachinesToSleep(self):
|
def putIdleMachinesToSleep(self):
|
||||||
self.m_clusterStatus.m_lock.acquire()
|
self.cluster_status.lock.acquire()
|
||||||
idleMachines = self.m_clusterStatus.getIdleMachines()
|
idleMachines = self.cluster_status.get_idle_machines()
|
||||||
# logInfo('idleMachines :')
|
# log_info('idleMachines :')
|
||||||
self.m_machinesThatNeedToSleep = []
|
for _machineName, idleMachine in idleMachines.items():
|
||||||
for machineName, idleMachine in idleMachines.items():
|
if idleMachine.get_power_state() == PowerState.ON:
|
||||||
if idleMachine.getPowerState() == PowerState.ON:
|
# log_info('\t%s' % machineName)
|
||||||
# logInfo('\t%s' % machineName)
|
if idleMachine.get_name() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things
|
||||||
if idleMachine.getName() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things
|
self.machines_that_need_sleeping[idleMachine.get_name()] = idleMachine
|
||||||
self.m_machinesThatNeedSleeping[idleMachine.getName()] = idleMachine
|
self.cluster_status.lock.release()
|
||||||
self.m_clusterStatus.m_lock.release()
|
|
||||||
|
|
||||||
listOfMachinesThatNeedSleeping = self.m_machinesThatNeedSleeping.values() # duplicate the list so that we don't iterate on m_machinesThatNeedSleeping, which could cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
|
listOfMachinesThatNeedSleeping = self.machines_that_need_sleeping.values() # duplicate the list so that we don't iterate on machines_that_need_sleeping, which could cause a runtime error because callbacks alter machines_that_need_wake_up
|
||||||
for machine in listOfMachinesThatNeedSleeping:
|
for machine in listOfMachinesThatNeedSleeping:
|
||||||
logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName())
|
log_info('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.get_name())
|
||||||
machine.requestSleep(SleepCompleteNotifier(machine.getName(), self))
|
machine.request_sleep(SleepCompleteNotifier(machine.get_name(), self)) # pylint: disable=no-value-for-parameter
|
||||||
|
|
||||||
if len(listOfMachinesThatNeedSleeping) != 0:
|
if len(listOfMachinesThatNeedSleeping) != 0:
|
||||||
# hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
|
# hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
|
||||||
while self.getNumPendingSleeps() > 0:
|
while self.getNumPendingSleeps() > 0:
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
def wakeUpMachinesForPendingJobs(self):
|
def wake_up_machinesForPendingJobs(self):
|
||||||
listOfMachinesThatNeedWakeUp = []
|
listOfMachinesThatNeedWakeUp = []
|
||||||
|
|
||||||
self.m_clusterStatus.m_lock.acquire()
|
self.cluster_status.lock.acquire()
|
||||||
pendingJobs = self.m_clusterStatus.getPendingJobs()
|
pendingJobs = self.cluster_status.get_pending_jobs()
|
||||||
"""
|
# log_info('pending jobs :')
|
||||||
logInfo('pending jobs :')
|
# for job in pendingJobs.values():
|
||||||
for job in pendingJobs.values():
|
# log_info('\t%d' % job.getId().asStr())
|
||||||
logInfo('\t%d' % job.getId().asStr())
|
|
||||||
"""
|
|
||||||
if len(pendingJobs) != 0:
|
if len(pendingJobs) != 0:
|
||||||
self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp(pendingJobs, self.m_clusterStatus)
|
self.machines_that_need_wake_up = self.slot_allocator.get_machinesThatNeedWakeUp(pendingJobs, self.cluster_status)
|
||||||
if len(self.m_machinesThatNeedWakeUp) == 0:
|
if len(self.machines_that_need_wake_up) == 0:
|
||||||
None
|
pass
|
||||||
# logInfo('ClusterController::updateNormalState : no machine needs waking up')
|
# log_info('ClusterController::updateNormalState : no machine needs waking up')
|
||||||
else:
|
else:
|
||||||
listOfMachinesThatNeedWakeUp = self.m_machinesThatNeedWakeUp.values() # duplicate the list so that we don't iterate on m_machinesThatNeedWakeUp, which would cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
|
listOfMachinesThatNeedWakeUp = self.machines_that_need_wake_up.values() # duplicate the list so that we don't iterate on machines_that_need_wake_up, which would cause a runtime error because callbacks alter machines_that_need_wake_up
|
||||||
for machine in listOfMachinesThatNeedWakeUp:
|
for machine in listOfMachinesThatNeedWakeUp:
|
||||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for ' + machine.getName())
|
log_info('ClusterController::wake_up_machinesForPendingJobs : requesting wake up for ' + machine.get_name())
|
||||||
machine.requestWakeUp(WakeUpCompleteNotifier(machine.getName(), self))
|
machine.request_wake_up(WakeUpCompleteNotifier(machine.get_name(), self)) # pylint: disable=no-value-for-parameter
|
||||||
self.m_clusterStatus.m_lock.release()
|
self.cluster_status.lock.release()
|
||||||
|
|
||||||
if len(listOfMachinesThatNeedWakeUp) != 0:
|
if len(listOfMachinesThatNeedWakeUp) != 0:
|
||||||
# hack : wait until the wakeup requests are handled so that a later sleep request doesn't cancel it
|
# hack : wait until the wakeup requests are handled so that a later sleep request doesn't cancel it
|
||||||
|
@ -184,16 +199,16 @@ class ClusterController:
|
||||||
while self.getNumPendingWakeUps() > 0:
|
while self.getNumPendingWakeUps() > 0:
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
iSGE_CHEK_RUNNABLE_JOBS_DELAY = 60 * 5 # max time it takes for sge between the fact that a queued job is runnable and SGE actually starting it (I've put a long time here because sometimes, qstat takes a long time to ralise that the machine is available after I wake it up)
|
iSGE_CHEK_RUNNABLE_JOBS_DELAY = 60 * 5 # max time it takes for sge between the fact that a queued job is runnable and SGE actually starting it (I've put a long time here because sometimes, qstat takes a long time to ralise that the machine is available after I wake it up)
|
||||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : all required machines are awake. Now give %d seconds to SGE to allocate slots.' % iSGE_CHEK_RUNNABLE_JOBS_DELAY)
|
log_info('ClusterController::wake_up_machinesForPendingJobs : all required machines are awake. Now give %d seconds to SGE to allocate slots.' % iSGE_CHEK_RUNNABLE_JOBS_DELAY)
|
||||||
# wait until SGE has a chance to allocate slots
|
# wait until SGE has a chance to allocate slots
|
||||||
time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time....
|
time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time....
|
||||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots')
|
log_info('ClusterController::wake_up_machinesForPendingJobs : end of the delay given to SGE to allocate slots')
|
||||||
|
|
||||||
def updateNormalState(self):
|
def updateNormalState(self):
|
||||||
# attempt to shut down machines that are idle
|
# attempt to shut down machines that are idle
|
||||||
self.putIdleMachinesToSleep()
|
self.putIdleMachinesToSleep()
|
||||||
# wake up necessary machines if there are pending jobs
|
# wake up necessary machines if there are pending jobs
|
||||||
self.wakeUpMachinesForPendingJobs()
|
self.wake_up_machinesForPendingJobs()
|
||||||
|
|
||||||
def storeSessionInDatabase(self):
|
def storeSessionInDatabase(self):
|
||||||
conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
|
conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
|
||||||
|
@ -207,7 +222,7 @@ class ClusterController:
|
||||||
iSessionId = r.fetch_row()[0][0]
|
iSessionId = r.fetch_row()[0][0]
|
||||||
|
|
||||||
# stores information about the session
|
# stores information about the session
|
||||||
sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.m_clusterStatus.m_clusterNodes))
|
sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.cluster_status.cluster_nodes))
|
||||||
print(sqlCommand)
|
print(sqlCommand)
|
||||||
conn.query(sqlCommand)
|
conn.query(sqlCommand)
|
||||||
|
|
||||||
|
@ -225,64 +240,62 @@ class ClusterController:
|
||||||
assert conn
|
assert conn
|
||||||
|
|
||||||
# update energy savings for the current session
|
# update energy savings for the current session
|
||||||
sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.m_clusterStatus.getEnergySavings()), self.m_iSessionId)
|
sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.cluster_status.get_energy_savings()), self.session_id)
|
||||||
print(sqlCommand)
|
print(sqlCommand)
|
||||||
conn.query(sqlCommand)
|
conn.query(sqlCommand)
|
||||||
|
|
||||||
# update the end time of the current session
|
# update the end time of the current session
|
||||||
sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.m_iSessionId)
|
sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.session_id)
|
||||||
print(sqlCommand)
|
print(sqlCommand)
|
||||||
conn.query(sqlCommand)
|
conn.query(sqlCommand)
|
||||||
|
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
def setControlOnMachine(self, machineName, bControl):
|
def set_control_on_machine(self, machineName, bControl):
|
||||||
"""
|
"""
|
||||||
adds or removes the control of ClusterController on the given machine
|
adds or removes the control of ClusterController on the given machine
|
||||||
"""
|
"""
|
||||||
self.m_clusterStatus.setControlOnMachine(machineName, bControl)
|
self.cluster_status.set_control_on_machine(machineName, bControl)
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
self.m_iSessionId = self.storeSessionInDatabase()
|
self.session_id = self.storeSessionInDatabase()
|
||||||
log("storeSessionInDatabase completed")
|
log("storeSessionInDatabase completed")
|
||||||
DELAY_BETWEEN_MEASURES = 10 # in seconds
|
DELAY_BETWEEN_MEASURES = 10 # in seconds
|
||||||
self.m_clusterStatus.startReadingThreads()
|
self.cluster_status.start_reading_threads()
|
||||||
self.m_webServer.start()
|
self.web_server.start()
|
||||||
while not self.m_clusterStatus.isReady():
|
while not self.cluster_status.is_ready():
|
||||||
log('waiting for system to be ready')
|
log('waiting for system to be ready')
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
None
|
log_info('ClusterController::run : cluster initial readings have completed')
|
||||||
logInfo('ClusterController::run : cluster initial readings have completed')
|
|
||||||
startTime = time.localtime()
|
startTime = time.localtime()
|
||||||
while not self.m_bStop:
|
while not self.stop:
|
||||||
currentTime = time.time()
|
currentTime = time.time()
|
||||||
# clusterStatus.m_nodesStatus['simpatix10'].dump()
|
if (not self.last_energy_status_log_time) or (currentTime > (self.last_energy_status_log_time + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
|
||||||
if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
|
iNumMachines = len(self.cluster_status.cluster_nodes)
|
||||||
iNumMachines = len(self.m_clusterStatus.m_clusterNodes)
|
|
||||||
iNumMachinesOn = 0
|
iNumMachinesOn = 0
|
||||||
iNumSleepingMachines = 0
|
iNumSleepingMachines = 0
|
||||||
for machine in self.m_clusterStatus.m_clusterNodes.values():
|
for machine in self.cluster_status.cluster_nodes.values():
|
||||||
ePowerState = machine.getPowerState()
|
ePowerState = machine.get_power_state()
|
||||||
if ePowerState == PowerState.ON:
|
if ePowerState == PowerState.ON:
|
||||||
iNumMachinesOn += 1
|
iNumMachinesOn += 1
|
||||||
elif ePowerState == PowerState.SLEEP:
|
elif ePowerState == PowerState.SLEEP:
|
||||||
iNumSleepingMachines += 1
|
iNumSleepingMachines += 1
|
||||||
logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))
|
log_info('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))
|
||||||
iNumSlots = self.m_clusterStatus.getNumControlledSlots()
|
iNumSlots = self.cluster_status.get_num_controlled_slots()
|
||||||
iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots()
|
iNumUsedSlots = self.cluster_status.get_num_used_slots()
|
||||||
iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots()
|
iNumWastedSlots = self.cluster_status.get_num_wasted_slots()
|
||||||
iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots()
|
iNumSleepingSlots = self.cluster_status.get_num_sleeping_slots()
|
||||||
logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots))
|
log_info('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots))
|
||||||
logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()))
|
log_info('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.cluster_status.get_current_power_consumption(), self.cluster_status.get_current_power_savings()))
|
||||||
logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings())))
|
log_info('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.cluster_status.get_energy_consumption()), jouleToKwh(self.cluster_status.get_energy_savings())))
|
||||||
self.updateSessionEnergyConsumptionInDatabase()
|
self.updateSessionEnergyConsumptionInDatabase()
|
||||||
self.m_lastEnergyStatusLogTime = currentTime
|
self.last_energy_status_log_time = currentTime
|
||||||
|
|
||||||
self.updateNormalState()
|
self.updateNormalState()
|
||||||
time.sleep(DELAY_BETWEEN_MEASURES)
|
time.sleep(DELAY_BETWEEN_MEASURES)
|
||||||
self.m_clusterStatus.stopReadingThreads()
|
self.cluster_status.stop_reading_threads()
|
||||||
|
|
||||||
|
|
||||||
def storeClusterNodeStatus(clusterNodeStatus):
|
def storeClusterNodeStatus(clusterNodeStatus):
|
||||||
|
@ -290,19 +303,17 @@ def storeClusterNodeStatus(clusterNodeStatus):
|
||||||
conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
|
conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
|
||||||
assert conn
|
assert conn
|
||||||
# conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""")
|
# conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""")
|
||||||
'''
|
# conn.query("""SELECT * FROM fan_rpm_logs""")
|
||||||
conn.query("""SELECT * FROM fan_rpm_logs""")
|
# r=conn.store_result()
|
||||||
r=conn.store_result()
|
# print r.fetch_row()[0]
|
||||||
print r.fetch_row()[0]
|
for _key, sensor in clusterNodeStatus.sensors.items():
|
||||||
'''
|
sensorId = clusterNodeStatus.cluster_node_name + '_' + sensor.name
|
||||||
for key, sensor in clusterNodeStatus.m_sensors.items():
|
|
||||||
sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name
|
|
||||||
if sensor.typeName() == 'Fan':
|
if sensor.typeName() == 'Fan':
|
||||||
sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_rpms) + """, NOW());"""
|
sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.rpms) + """, NOW());"""
|
||||||
print(sqlCommand)
|
print(sqlCommand)
|
||||||
conn.query(sqlCommand)
|
conn.query(sqlCommand)
|
||||||
elif sensor.typeName() == 'Temperature':
|
elif sensor.typeName() == 'Temperature':
|
||||||
sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_temperature) + """, NOW());"""
|
sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.temperature) + """, NOW());"""
|
||||||
print(sqlCommand)
|
print(sqlCommand)
|
||||||
conn.query(sqlCommand)
|
conn.query(sqlCommand)
|
||||||
else:
|
else:
|
||||||
|
@ -311,11 +322,11 @@ def storeClusterNodeStatus(clusterNodeStatus):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# Lib.Util.sendTextMail('SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content')
|
# Lib.Util.send_text_mail('SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content')
|
||||||
try:
|
try:
|
||||||
logInfo('ClusterController v. %s starting....' % VERSION)
|
log_info('ClusterController v. %s starting....' % VERSION)
|
||||||
# executeCommand('ping -o -t 1 simpatix310 > /dev/null')
|
# execute_command('ping -o -t 1 simpatix310 > /dev/null')
|
||||||
# print executeCommand('ssh simpatix10 "ipmitool sensor"')
|
# print execute_command('ssh simpatix10 "ipmitool sensor"')
|
||||||
# assert False, 'prout'
|
# assert False, 'prout'
|
||||||
controller = ClusterController()
|
controller = ClusterController()
|
||||||
controller.run()
|
controller.run()
|
||||||
|
@ -323,4 +334,4 @@ if __name__ == '__main__':
|
||||||
# except AssertionError, error:
|
# except AssertionError, error:
|
||||||
# except KeyboardInterrupt, error:
|
# except KeyboardInterrupt, error:
|
||||||
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||||
onException(exception)
|
on_exception(exception)
|
||||||
|
|
|
@ -1,142 +1,161 @@
|
||||||
import threading
|
from typing import TYPE_CHECKING
|
||||||
from PowerState import PowerState, PowerStateToStr
|
from typing import Optional
|
||||||
from ClusterNodeStatusUpdater import ClusterNodeStatusUpdater
|
from .PowerState import PowerState, PowerStateToStr
|
||||||
import Lib.Util
|
from .Log import log_info, log_warning
|
||||||
import Lib.SimpaDbUtil
|
from .ClusterNodeStatusUpdater import ClusterNodeStatusUpdater
|
||||||
from Log import logInfo, logWarning
|
if TYPE_CHECKING:
|
||||||
|
from .ClusterStatus import ClusterStatus
|
||||||
|
from .SunGridEngine import SunGridEngine
|
||||||
|
from .ClusterController import SleepCompleteNotifier
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
ClusterNodeId = str # eg 'physix99'
|
||||||
|
|
||||||
|
|
||||||
class ClusterNode:
|
class ClusterNode:
|
||||||
"""
|
"""
|
||||||
the state of a machine node
|
the state of a machine node
|
||||||
"""
|
"""
|
||||||
def __init__(self, machineName, cluster, gridEngine):
|
name: ClusterNodeId
|
||||||
self.m_name = machineName
|
cluster: 'ClusterStatus' # the cluster this machine belongs to
|
||||||
self.m_cluster = cluster # the cluster this machine belongs to
|
requested_power_state: PowerState
|
||||||
self.m_requestedPowerState = PowerState.ON
|
power_state: PowerState
|
||||||
self.m_powerState = PowerState.UNKNOWN
|
last_power_state_time: Optional[datetime] # time at which the last value of self.power_state has been set
|
||||||
self.m_lastPowerStateTime = None # time at which the last value of self.m_powerState has been set
|
machine_status_updater: ClusterNodeStatusUpdater
|
||||||
self.m_machineStatusUpdater = ClusterNodeStatusUpdater(machineName, self, gridEngine)
|
energy_consumption: float # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
|
||||||
self.m_energyConsumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
|
energy_savings: float # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
|
||||||
self.m_energySavings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
|
|
||||||
|
|
||||||
def getName(self):
|
def __init__(self, machine_name: ClusterNodeId, cluster: 'ClusterStatus', grid_engine: 'SunGridEngine'):
|
||||||
return self.m_name
|
self.name = machine_name
|
||||||
|
self.cluster = cluster # the cluster this machine belongs to
|
||||||
|
self.requested_power_state = PowerState.ON
|
||||||
|
self.power_state = PowerState.UNKNOWN
|
||||||
|
self.last_power_state_time = None # time at which the last value of self.power_state has been set
|
||||||
|
self.machine_status_updater = ClusterNodeStatusUpdater(machine_name, self, grid_engine)
|
||||||
|
self.energy_consumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
|
||||||
|
self.energy_savings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
|
||||||
|
|
||||||
def isReady(self):
|
def get_name(self) -> ClusterNodeId:
|
||||||
if self.m_powerState == PowerState.UNKNOWN:
|
return self.name
|
||||||
# logInfo(self.m_name + ' is not ready (waiting for power state)')
|
|
||||||
|
def is_ready(self) -> bool:
|
||||||
|
if self.power_state == PowerState.UNKNOWN:
|
||||||
|
# log_info(self.name + ' is not ready (waiting for power state)')
|
||||||
return False
|
return False
|
||||||
if self.m_powerState == PowerState.ON:
|
if self.power_state == PowerState.ON:
|
||||||
return True
|
return True
|
||||||
# log(self.m_name + ' is ready')
|
# log(self.name + ' is ready')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def getPowerState(self):
|
def get_power_state(self) -> PowerState:
|
||||||
return self.m_powerState
|
return self.power_state
|
||||||
|
|
||||||
def setShouldAlwaysBeOn(self):
|
def set_should_always_be_on(self):
|
||||||
self.m_machineStatusUpdater.setShouldAlwaysBeOn()
|
self.machine_status_updater.set_should_always_be_on()
|
||||||
self.setPowerState(PowerState.ON)
|
self.set_power_state(PowerState.ON)
|
||||||
|
|
||||||
def setPowerState(self, powerState):
|
def set_power_state(self, power_state: PowerState):
|
||||||
bUpdateRequiredChecks = False
|
bUpdateRequiredChecks = False
|
||||||
if self.m_powerState == PowerState.UNKNOWN:
|
if self.power_state == PowerState.UNKNOWN:
|
||||||
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been initialized to ' + PowerStateToStr(powerState))
|
log_info('ClusterNode::set_power_state : ' + self.name + '\'s power state has been initialized to ' + PowerStateToStr(power_state))
|
||||||
self.m_powerState = powerState
|
self.power_state = power_state
|
||||||
self.m_lastPowerStateTime = datetime.now()
|
self.last_power_state_time = datetime.now()
|
||||||
bUpdateRequiredChecks = True
|
bUpdateRequiredChecks = True
|
||||||
else:
|
else:
|
||||||
# update the estimation of energy consumption
|
# update the estimation of energy consumption
|
||||||
self.updateEnergyMeasurements()
|
self.update_energy_measurements()
|
||||||
# then change the power state
|
# then change the power state
|
||||||
if self.m_powerState != powerState:
|
if self.power_state != power_state:
|
||||||
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been changed to ' + PowerStateToStr(powerState))
|
log_info('ClusterNode::set_power_state : ' + self.name + '\'s power state has been changed to ' + PowerStateToStr(power_state))
|
||||||
self.m_powerState = powerState
|
self.power_state = power_state
|
||||||
self.m_lastPowerStateTime = datetime.now()
|
self.last_power_state_time = datetime.now()
|
||||||
bUpdateRequiredChecks = True
|
bUpdateRequiredChecks = True
|
||||||
if bUpdateRequiredChecks:
|
if bUpdateRequiredChecks:
|
||||||
if self.m_powerState == PowerState.ON:
|
if self.power_state == PowerState.ON:
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
self.machine_status_updater.check_power_state = True
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = True
|
self.machine_status_updater.check_sensors = True
|
||||||
elif self.m_powerState == PowerState.OFF:
|
elif self.power_state == PowerState.OFF:
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
self.machine_status_updater.check_power_state = True
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = False
|
self.machine_status_updater.check_sensors = False
|
||||||
elif self.m_powerState == PowerState.SLEEP:
|
elif self.power_state == PowerState.SLEEP:
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
self.machine_status_updater.check_power_state = True
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = False
|
self.machine_status_updater.check_sensors = False
|
||||||
elif self.m_powerState == PowerState.UNPLUGGED:
|
elif self.power_state == PowerState.UNPLUGGED:
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
self.machine_status_updater.check_power_state = True
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = False
|
self.machine_status_updater.check_sensors = False
|
||||||
else:
|
else:
|
||||||
assert False
|
assert False
|
||||||
|
|
||||||
def onNewPowerStateReading(self, powerState):
|
def on_new_power_state_reading(self, power_state: PowerState):
|
||||||
"""
|
"""
|
||||||
called when a new powerstate reading arrives
|
called when a new powerstate reading arrives
|
||||||
"""
|
"""
|
||||||
if powerState != self.getPowerState():
|
if power_state != self.get_power_state():
|
||||||
if self.getPowerState() != PowerState.UNKNOWN:
|
if self.get_power_state() != PowerState.UNKNOWN:
|
||||||
logWarning('ClusterNode::onNewPowerStateReading : ' + self.m_name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(powerState))
|
log_warning('ClusterNode::on_new_power_state_reading : ' + self.name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(power_state))
|
||||||
self.setPowerState(powerState)
|
self.set_power_state(power_state)
|
||||||
|
|
||||||
def getPowerConsumptionForPowerState(self, ePowerState):
|
def get_power_consumption_for_power_state(self, power_state: PowerState) -> float:
|
||||||
"""
|
"""
|
||||||
returns the power consumption estimation (in watts) of this machine for the given power state
|
returns the power consumption estimation (in watts) of this machine for the given power state
|
||||||
"""
|
"""
|
||||||
fCurrentIntensity = 0.0
|
fCurrentIntensity = 0.0
|
||||||
fCurrentVoltage = 220.0
|
fCurrentVoltage = 220.0
|
||||||
# noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine
|
# noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine
|
||||||
if ePowerState == PowerState.ON:
|
if power_state == PowerState.ON:
|
||||||
fCurrentIntensity = 0.9 # value when the machine is doing nothing
|
fCurrentIntensity = 0.9 # value when the machine is doing nothing
|
||||||
elif ePowerState == PowerState.OFF:
|
elif power_state == PowerState.OFF:
|
||||||
fCurrentIntensity = 0.1
|
fCurrentIntensity = 0.1
|
||||||
elif ePowerState == PowerState.SLEEP:
|
elif power_state == PowerState.SLEEP:
|
||||||
fCurrentIntensity = 0.1
|
fCurrentIntensity = 0.1
|
||||||
elif ePowerState == PowerState.UNPLUGGED:
|
elif power_state == PowerState.UNPLUGGED:
|
||||||
fCurrentIntensity = 0.0
|
fCurrentIntensity = 0.0
|
||||||
else:
|
else:
|
||||||
assert False
|
assert False
|
||||||
return fCurrentIntensity * fCurrentVoltage
|
return fCurrentIntensity * fCurrentVoltage
|
||||||
|
|
||||||
def updateEnergyMeasurements(self):
|
def update_energy_measurements(self):
|
||||||
timeInterval = datetime.now() - self.m_lastPowerStateTime
|
timeInterval = datetime.now() - self.last_power_state_time
|
||||||
self.m_energyConsumption += self.getPowerConsumptionForPowerState(self.m_powerState) * timeInterval.seconds
|
self.energy_consumption += self.get_power_consumption_for_power_state(self.power_state) * timeInterval.seconds
|
||||||
self.m_energySavings += (self.getPowerConsumptionForPowerState(PowerState.ON) - self.getPowerConsumptionForPowerState(self.m_powerState)) * timeInterval.seconds
|
self.energy_savings += (self.get_power_consumption_for_power_state(PowerState.ON) - self.get_power_consumption_for_power_state(self.power_state)) * timeInterval.seconds
|
||||||
self.m_lastPowerStateTime = datetime.now()
|
self.last_power_state_time = datetime.now()
|
||||||
# logDebug('energy savings on %s : %f J' %(self.getName(), self.m_energySavings))
|
# logDebug('energy savings on %s : %f J' %(self.get_name(), self.energy_savings))
|
||||||
|
|
||||||
def getEnergyConsumption(self):
|
def get_energy_consumption(self) -> float:
|
||||||
"""
|
"""
|
||||||
in joules
|
in joules
|
||||||
"""
|
"""
|
||||||
self.updateEnergyMeasurements()
|
self.update_energy_measurements()
|
||||||
return self.m_energyConsumption
|
return self.energy_consumption
|
||||||
|
|
||||||
def getPowerConsumption(self):
|
def get_power_consumption(self) -> float:
|
||||||
fCurrentPowerConsumption = self.getPowerConsumptionForPowerState(self.m_powerState)
|
fCurrentPowerConsumption = self.get_power_consumption_for_power_state(self.power_state)
|
||||||
# logDebug('getPowerConsumption of %s : %f (powerstate = %d)' % (self.getName(), fCurrentPowerConsumption, self.m_powerState))
|
# logDebug('get_power_consumption of %s : %f (powerstate = %d)' % (self.get_name(), fCurrentPowerConsumption, self.power_state))
|
||||||
return fCurrentPowerConsumption
|
return fCurrentPowerConsumption
|
||||||
|
|
||||||
def getEnergySavings(self):
|
def get_energy_savings(self) -> float:
|
||||||
self.updateEnergyMeasurements()
|
self.update_energy_measurements()
|
||||||
return self.m_energySavings
|
return self.energy_savings
|
||||||
|
|
||||||
def onSleepFailedBecauseAJobJustArrived(self):
|
def on_sleep_because_a_job_just_arrived(self):
|
||||||
logInfo('%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name)
|
log_info('%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.name)
|
||||||
|
|
||||||
def requestSleep(self, sleepCompleteNotifier=None):
|
def request_sleep(self, sleep_complete_notifier: Optional['SleepCompleteNotifier'] = None):
|
||||||
self.m_machineStatusUpdater.requestSleep(sleepCompleteNotifier)
|
self.machine_status_updater.request_sleep(sleep_complete_notifier)
|
||||||
|
|
||||||
def requestWakeUp(self, wakeUpCompleteNotifier=None):
|
def request_wake_up(self, wake_up_complete_notifier: Optional['SleepCompleteNotifier'] = None):
|
||||||
self.m_machineStatusUpdater.requestWakeUp(wakeUpCompleteNotifier)
|
self.machine_status_updater.request_wake_up(wake_up_complete_notifier)
|
||||||
|
|
||||||
def getQueueMachineName(self):
|
def get_queue_machine_name(self) -> ClusterNodeId:
|
||||||
return self.getCluster().getJobsState().getQueueMachine(self.m_name).getName()
|
return self.get_cluster().get_jobs_state().get_queue_machine(self.name).get_name()
|
||||||
assert self.m_queueName is not None
|
# assert self.queue_name is not None
|
||||||
return self.m_queueName
|
# return self.queue_name
|
||||||
|
|
||||||
def getCluster(self):
|
def get_cluster(self) -> 'ClusterStatus':
|
||||||
return self.m_cluster
|
return self.cluster
|
||||||
|
|
||||||
|
|
||||||
|
# from .ClusterStatus import ClusterStatus # noqa: E402, pylint: disable=wrong-import-position
|
||||||
|
# from .SunGridEngine import SunGridEngine # noqa: E402, pylint: disable=wrong-import-position
|
||||||
|
# from .ClusterController import SleepCompleteNotifier # noqa: E402, pylint: disable=wrong-import-position
|
||||||
|
|
|
@ -1,39 +1,46 @@
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from typing import Optional, List
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import Lib.Util
|
import abc
|
||||||
import Lib.SimpaDbUtil
|
from .PowerState import PowerState
|
||||||
from PowerState import PowerState
|
from .Log import log_info, logDebug
|
||||||
from Log import logInfo, logDebug
|
from .Util import blocking_wake_up_machine, blocking_put_machine_to_sleep, get_power_state, on_exception
|
||||||
from Util import blockingWakeUpMachine, blockingPutMachineToSleep, getPowerState, onException
|
if TYPE_CHECKING:
|
||||||
|
from .ClusterNode import ClusterNodeId, ClusterNode
|
||||||
|
from .SunGridEngine import SunGridEngine
|
||||||
|
|
||||||
|
|
||||||
class IWakeUpCompleteNotifier:
|
class IWakeUpCompleteNotifier(abc.ABCMeta):
|
||||||
"""
|
"""
|
||||||
interface for wakeup notifiers
|
interface for wakeup notifiers
|
||||||
"""
|
"""
|
||||||
def onWakeUpComplete(self):
|
@abc.abstractmethod
|
||||||
|
def on_wake_up_complete(self):
|
||||||
assert False
|
assert False
|
||||||
|
|
||||||
|
|
||||||
class ISleepCompleteNotifier:
|
class ISleepCompleteNotifier(abc.ABCMeta):
|
||||||
"""
|
"""
|
||||||
interface for sleep notifiers
|
interface for sleep notifiers
|
||||||
"""
|
"""
|
||||||
def onSleepComplete(self, bSleepSucceeded):
|
@abc.abstractmethod
|
||||||
|
def on_sleep_complete(self, bSleepSucceeded):
|
||||||
assert False
|
assert False
|
||||||
|
|
||||||
|
|
||||||
class IRequest:
|
class IRequest(abc.ABCMeta):
|
||||||
GO_TO_SLEEP = 1
|
GO_TO_SLEEP = 1
|
||||||
WAKE_UP = 2
|
WAKE_UP = 2
|
||||||
CHECK_POWER_STATE = 3
|
CHECK_POWER_STATE = 3
|
||||||
|
|
||||||
def __init__(self, requestType):
|
def __init__(self, requestType):
|
||||||
self.m_type = requestType
|
self.type = requestType
|
||||||
|
|
||||||
def getType(self):
|
def getType(self):
|
||||||
return self.m_type
|
return self.type
|
||||||
|
|
||||||
|
@abc.abstractmethod
|
||||||
def process(self, clusterNodeStatusUpdater):
|
def process(self, clusterNodeStatusUpdater):
|
||||||
"""
|
"""
|
||||||
processes this request
|
processes this request
|
||||||
|
@ -43,58 +50,58 @@ class IRequest:
|
||||||
|
|
||||||
class WakeUpRequest(IRequest):
|
class WakeUpRequest(IRequest):
|
||||||
|
|
||||||
def __init__(self, wakeUpNotifier):
|
def __init__(self, wakeUpNotifier: IWakeUpCompleteNotifier):
|
||||||
IRequest.__init__(self, IRequest.WAKE_UP)
|
IRequest.__init__(self, IRequest.WAKE_UP)
|
||||||
self.m_wakeUpNotifier = wakeUpNotifier
|
self.wake_up_notifier = wakeUpNotifier
|
||||||
|
|
||||||
def process(self, clusterNodeStatusUpdater):
|
def process(self, clusterNodeStatusUpdater):
|
||||||
assert clusterNodeStatusUpdater.m_bShouldAlwaysBeOn is False # are we attempting to wake up a machine that should always be on ?
|
assert clusterNodeStatusUpdater.should_always_be_on is False # are we attempting to wake up a machine that should always be on ?
|
||||||
logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName())
|
log_info('Handling wakeup request for %s' % clusterNodeStatusUpdater.get_name())
|
||||||
bSuccess = blockingWakeUpMachine(clusterNodeStatusUpdater.getName())
|
bSuccess = blocking_wake_up_machine(clusterNodeStatusUpdater.get_name())
|
||||||
assert bSuccess
|
assert bSuccess
|
||||||
# activate the associated machine queue
|
# activate the associated machine queue
|
||||||
if clusterNodeStatusUpdater.setQueueActivation(True):
|
if clusterNodeStatusUpdater.set_queue_activation(True):
|
||||||
pass # all is ok
|
pass # all is ok
|
||||||
else:
|
else:
|
||||||
assert False
|
assert False
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
clusterNodeStatusUpdater.state_lock.acquire()
|
||||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)
|
clusterNodeStatusUpdater.cluster_node.set_power_state(PowerState.ON)
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
clusterNodeStatusUpdater.state_lock.release()
|
||||||
if self.m_wakeUpNotifier:
|
if self.wake_up_notifier:
|
||||||
logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
|
logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
|
||||||
self.m_wakeUpNotifier.onWakeUpComplete()
|
self.wake_up_notifier.on_wake_up_complete()
|
||||||
|
|
||||||
|
|
||||||
class SleepRequest(IRequest):
|
class SleepRequest(IRequest):
|
||||||
|
|
||||||
def __init__(self, sleepCompleteNotifier):
|
def __init__(self, sleepCompleteNotifier: ISleepCompleteNotifier):
|
||||||
IRequest.__init__(self, IRequest.GO_TO_SLEEP)
|
IRequest.__init__(self, IRequest.GO_TO_SLEEP)
|
||||||
self.m_sleepCompleteNotifier = sleepCompleteNotifier
|
self.sleep_complete_notifier = sleepCompleteNotifier
|
||||||
|
|
||||||
def process(self, clusterNodeStatusUpdater):
|
def process(self, clusterNodeStatusUpdater):
|
||||||
assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn # are we attempting to put a machine the should stay on to sleep ?
|
assert not clusterNodeStatusUpdater.should_always_be_on # are we attempting to put a machine the should stay on to sleep ?
|
||||||
logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName())
|
log_info('Handling sleep request for %s' % clusterNodeStatusUpdater.get_name())
|
||||||
if clusterNodeStatusUpdater.setQueueActivation(False):
|
if clusterNodeStatusUpdater.set_queue_activation(False):
|
||||||
if clusterNodeStatusUpdater.queueIsEmpty():
|
if clusterNodeStatusUpdater.queue_is_empty():
|
||||||
if blockingPutMachineToSleep(clusterNodeStatusUpdater.m_clusterNodeName):
|
if blocking_put_machine_to_sleep(clusterNodeStatusUpdater.cluster_node_name):
|
||||||
# now we know that the machine is asleep
|
# now we know that the machine is asleep
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
clusterNodeStatusUpdater.state_lock.acquire()
|
||||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.SLEEP)
|
clusterNodeStatusUpdater.cluster_node.set_power_state(PowerState.SLEEP)
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
clusterNodeStatusUpdater.state_lock.release()
|
||||||
if self.m_sleepCompleteNotifier:
|
if self.sleep_complete_notifier:
|
||||||
self.m_sleepCompleteNotifier.onSleepComplete(True)
|
self.sleep_complete_notifier.on_sleep_complete(True)
|
||||||
else:
|
else:
|
||||||
assert False
|
assert False
|
||||||
else:
|
else:
|
||||||
# reactivate the queue
|
# reactivate the queue
|
||||||
if not clusterNodeStatusUpdater.setQueueActivation(True):
|
if not clusterNodeStatusUpdater.set_queue_activation(True):
|
||||||
assert False
|
assert False
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
clusterNodeStatusUpdater.state_lock.acquire()
|
||||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON) # this is necessary to reenable the various cyclic checks that were disabled on sleep request
|
clusterNodeStatusUpdater.cluster_node.set_power_state(PowerState.ON) # this is necessary to reenable the various cyclic checks that were disabled on sleep request
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
clusterNodeStatusUpdater.state_lock.release()
|
||||||
clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived()
|
clusterNodeStatusUpdater.cluster_node.on_sleep_because_a_job_just_arrived()
|
||||||
if self.m_sleepCompleteNotifier:
|
if self.sleep_complete_notifier:
|
||||||
self.m_sleepCompleteNotifier.onSleepComplete(False)
|
self.sleep_complete_notifier.on_sleep_complete(False)
|
||||||
else:
|
else:
|
||||||
assert False
|
assert False
|
||||||
|
|
||||||
|
@ -105,88 +112,99 @@ class CheckPowerStateRequest(IRequest):
|
||||||
IRequest.__init__(self, IRequest.CHECK_POWER_STATE)
|
IRequest.__init__(self, IRequest.CHECK_POWER_STATE)
|
||||||
|
|
||||||
def process(self, clusterNodeStatusUpdater):
|
def process(self, clusterNodeStatusUpdater):
|
||||||
powerState = getPowerState(clusterNodeStatusUpdater.m_clusterNodeName)
|
powerState = get_power_state(clusterNodeStatusUpdater.cluster_node_name)
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
clusterNodeStatusUpdater.state_lock.acquire()
|
||||||
clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading(powerState)
|
clusterNodeStatusUpdater.cluster_node.on_new_power_state_reading(powerState)
|
||||||
clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time()
|
clusterNodeStatusUpdater.last_power_check_state_time = time.time()
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
clusterNodeStatusUpdater.state_lock.release()
|
||||||
|
|
||||||
|
|
||||||
class ClusterNodeStatusUpdater(threading.Thread):
|
class ClusterNodeStatusUpdater(threading.Thread):
|
||||||
|
cluster_node_name: 'ClusterNodeId'
|
||||||
|
cluster_node: 'ClusterNode'
|
||||||
|
grid_engine: 'SunGridEngine'
|
||||||
|
stop: bool
|
||||||
|
last_power_check_state_time: Optional[time.time]
|
||||||
|
check_power_state: bool
|
||||||
|
check_sensors: Optional[bool]
|
||||||
|
state_lock: threading.Lock # lock that prevents concurrent access to the state of this instance
|
||||||
|
should_always_be_on: bool # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
|
||||||
|
pending_requests_queue: List[IRequest]
|
||||||
DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60 # in seconds
|
DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60 # in seconds
|
||||||
|
|
||||||
def __init__(self, machineName, clusterNode, gridEngine):
|
def __init__(self, machineName: 'ClusterNodeId', clusterNode: 'ClusterNode', gridEngine: 'SunGridEngine'):
|
||||||
threading.Thread.__init__(self)
|
threading.Thread.__init__(self)
|
||||||
self.m_clusterNodeName = machineName
|
self.cluster_node_name = machineName
|
||||||
self.m_clusterNode = clusterNode
|
self.cluster_node = clusterNode
|
||||||
self.m_gridEngine = gridEngine
|
self.grid_engine = gridEngine
|
||||||
self.m_bStop = False
|
self.stop = False
|
||||||
self.m_lastPowerStateCheckTime = None # time.time()
|
self.last_power_check_state_time = None
|
||||||
self.m_bCheckPowerState = True
|
self.check_power_state = True
|
||||||
self.m_stateLock = threading.Lock() # lock that prevents concurrent access to the state of this instance
|
self.state_lock = threading.Lock()
|
||||||
self.m_bShouldAlwaysBeOn = False # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
|
self.should_always_be_on = False
|
||||||
self.m_pendingRequestsQueue = []
|
self.pending_requests_queue = []
|
||||||
|
self.check_sensors = None
|
||||||
|
|
||||||
def getGridEngine(self):
|
def get_grid_engine(self):
|
||||||
return self.m_gridEngine
|
return self.grid_engine
|
||||||
|
|
||||||
def getName(self):
|
def get_name(self):
|
||||||
return self.m_clusterNodeName
|
return self.cluster_node_name
|
||||||
|
|
||||||
def setShouldAlwaysBeOn(self):
|
def set_should_always_be_on(self):
|
||||||
print('%s should always be on' % (self.getName()))
|
print('%s should always be on' % (self.get_name()))
|
||||||
self.m_bShouldAlwaysBeOn = True
|
self.should_always_be_on = True
|
||||||
|
|
||||||
def pushRequest(self, request):
|
def push_request(self, request: IRequest):
|
||||||
self.m_stateLock.acquire()
|
self.state_lock.acquire()
|
||||||
self.m_pendingRequestsQueue.append(request)
|
self.pending_requests_queue.append(request)
|
||||||
self.m_stateLock.release()
|
self.state_lock.release()
|
||||||
|
|
||||||
def popRequest(self):
|
def pop_request(self) -> IRequest:
|
||||||
oldestRequest = None
|
oldest_request = None
|
||||||
self.m_stateLock.acquire()
|
self.state_lock.acquire()
|
||||||
if len(self.m_pendingRequestsQueue) != 0:
|
if len(self.pending_requests_queue) != 0:
|
||||||
oldestRequest = self.m_pendingRequestsQueue.pop(0)
|
oldest_request = self.pending_requests_queue.pop(0)
|
||||||
self.m_stateLock.release()
|
self.state_lock.release()
|
||||||
return oldestRequest
|
return oldest_request
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
|
|
||||||
while not self.m_bStop:
|
while not self.stop:
|
||||||
# handle the oldest request
|
# handle the oldest request
|
||||||
request = self.popRequest()
|
request = self.pop_request()
|
||||||
if request is not None:
|
if request is not None:
|
||||||
request.process(self)
|
request.process(self)
|
||||||
|
|
||||||
# schedule a power state check if required
|
# schedule a power state check if required
|
||||||
currentTime = time.time()
|
currentTime = time.time()
|
||||||
if self.m_bCheckPowerState:
|
if self.check_power_state:
|
||||||
if not self.m_bShouldAlwaysBeOn: # don't do power checks on such machines because some current implementations of
|
if not self.should_always_be_on: # don't do power checks on such machines because some current implementations of
|
||||||
# operations involved might cause the machine to go to sleep
|
# operations involved might cause the machine to go to sleep
|
||||||
if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
|
if (not self.last_power_check_state_time) or (currentTime > (self.last_power_check_state_time + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
|
||||||
self.pushRequest(CheckPowerStateRequest())
|
self.push_request(CheckPowerStateRequest()) # pylint: disable=no-value-for-parameter
|
||||||
|
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||||
onException(exception)
|
on_exception(exception)
|
||||||
|
|
||||||
def requestSleep(self, sleepCompleteNotifier=None):
|
def request_sleep(self, sleep_complete_notifier: Optional[ISleepCompleteNotifier] = None):
|
||||||
assert not self.m_bShouldAlwaysBeOn
|
assert not self.should_always_be_on
|
||||||
self.pushRequest(SleepRequest(sleepCompleteNotifier))
|
self.push_request(SleepRequest(sleep_complete_notifier)) # pylint: disable=no-value-for-parameter
|
||||||
|
|
||||||
def requestWakeUp(self, wakeUpNotifier=None):
|
def request_wake_up(self, wake_up_complete_notifier: Optional[IWakeUpCompleteNotifier] = None):
|
||||||
assert self.m_bShouldAlwaysBeOn is False
|
assert self.should_always_be_on is False
|
||||||
self.pushRequest(WakeUpRequest(wakeUpNotifier))
|
self.push_request(WakeUpRequest(wake_up_complete_notifier)) # pylint: disable=no-value-for-parameter
|
||||||
|
|
||||||
def getQueueMachineName(self):
|
def get_queue_machine_name(self):
|
||||||
return self.m_clusterNode.getQueueMachineName()
|
return self.cluster_node.get_queue_machine_name()
|
||||||
|
|
||||||
def setQueueActivation(self, bEnable):
|
def set_queue_activation(self, bEnable: bool):
|
||||||
"""
|
"""
|
||||||
@return true on success, false otherwise
|
@return true on success, false otherwise
|
||||||
"""
|
"""
|
||||||
return self.getGridEngine().setQueueInstanceActivation(self.getQueueMachineName(), bEnable)
|
return self.get_grid_engine().set_queue_instance_activation(self.get_queue_machine_name(), bEnable)
|
||||||
|
|
||||||
def queueIsEmpty(self):
|
def queue_is_empty(self):
|
||||||
return self.getGridEngine().queueIsEmpty(self.getName())
|
return self.get_grid_engine().queue_is_empty(self.get_name())
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
import threading
|
|
||||||
from JobsStateUpdater import JobsStateUpdater
|
|
||||||
import Lib.Util
|
|
||||||
import Lib.SimpaDbUtil
|
|
||||||
from ClusterNode import ClusterNode
|
|
||||||
from Log import logInfo, logError
|
|
||||||
from PowerState import PowerState
|
|
||||||
import time
|
import time
|
||||||
|
from typing import Dict, Optional, List
|
||||||
|
import threading
|
||||||
|
from .Job import TaskUid, Task, QueueMachineId, JobRequirements
|
||||||
|
from .JobsStateUpdater import JobsStateUpdater
|
||||||
|
from .ClusterNode import ClusterNode, ClusterNodeId
|
||||||
|
from .Log import log_info, logError
|
||||||
|
from .PowerState import PowerState
|
||||||
|
from ..SimpaDbUtil import get_cluster_machines_names
|
||||||
|
from .SunGridEngine import SunGridEngine
|
||||||
|
from .JobsState import JobsState
|
||||||
|
|
||||||
|
|
||||||
class ClusterStatus:
|
class ClusterStatus:
|
||||||
|
@ -14,196 +17,203 @@ class ClusterStatus:
|
||||||
|
|
||||||
@param gridEngine the interface to the batch job tool (in our case it's sun grid engine)
|
@param gridEngine the interface to the batch job tool (in our case it's sun grid engine)
|
||||||
"""
|
"""
|
||||||
def __init__(self, gridEngine):
|
grid_engine: SunGridEngine
|
||||||
self.m_gridEngine = gridEngine
|
cluster_nodes: Dict[ClusterNodeId, ClusterNode]
|
||||||
self.m_clusterNodes = {}
|
lock: threading.Lock # to prevent concurrent access to this instance
|
||||||
self.m_lock = threading.Lock() # to prevent concurrent access to this instance
|
jobs_state_updater: JobsStateUpdater
|
||||||
self.m_jobsStateUpdater = JobsStateUpdater(self)
|
jobs_state: Optional[JobsState]
|
||||||
self.m_jobsState = None
|
controlled_machine_names: List[ClusterNodeId]
|
||||||
# self.m_controlledMachineNames = ['simpatix30']
|
|
||||||
self.m_controlledMachineNames = [] # ['simpatix30']
|
def __init__(self, grid_engine: SunGridEngine):
|
||||||
|
self.grid_engine = grid_engine
|
||||||
|
self.cluster_nodes = {}
|
||||||
|
self.lock = threading.Lock()
|
||||||
|
self.jobs_state_updater = JobsStateUpdater(self)
|
||||||
|
self.jobs_state = None
|
||||||
|
# self.controlled_machine_names = ['simpatix30']
|
||||||
|
self.controlled_machine_names = [] # ['simpatix30']
|
||||||
if False:
|
if False:
|
||||||
for iMachine in range(11, 40):
|
for iMachine in range(11, 40):
|
||||||
if (iMachine == 31) or (iMachine == 32):
|
if (iMachine == 31) or (iMachine == 32):
|
||||||
continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
|
continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
|
||||||
if (iMachine == 18):
|
if (iMachine == 18):
|
||||||
continue # this machine needs maintenance (restarting because it's very slow for an unknown reason)
|
continue # this machine needs maintenance (restarting because it's very slow for an unknown reason)
|
||||||
self.m_controlledMachineNames.append('simpatix%d' % iMachine)
|
self.controlled_machine_names.append('simpatix%d' % iMachine)
|
||||||
nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames()
|
node_names = get_cluster_machines_names()
|
||||||
for nodeName in nodeNames:
|
for node_name in node_names:
|
||||||
if nodeName in self.m_controlledMachineNames:
|
if node_name in self.controlled_machine_names:
|
||||||
logInfo('machine %s is under the cluster controller\'s control' % nodeName)
|
log_info('machine %s is under the cluster controller\'s control' % node_name)
|
||||||
clusterNode = ClusterNode(nodeName, self, gridEngine)
|
cluster_node = ClusterNode(node_name, self, grid_engine)
|
||||||
if nodeName == 'simpatix10':
|
if node_name == 'simpatix10':
|
||||||
clusterNode.setShouldAlwaysBeOn()
|
cluster_node.set_should_always_be_on()
|
||||||
self.m_clusterNodes[nodeName] = clusterNode
|
self.cluster_nodes[node_name] = cluster_node
|
||||||
return
|
return
|
||||||
|
|
||||||
def setControlOnMachine(self, machineName, bControl):
|
def set_control_on_machine(self, machine_name: ClusterNodeId, control: bool):
|
||||||
if bControl:
|
if control:
|
||||||
# add machineName under control of ClusterController
|
# add machineName under control of ClusterController
|
||||||
for k, v in self.m_clusterNodes.items():
|
for _k, v in self.cluster_nodes.items():
|
||||||
if v.getName() == machineName:
|
if v.get_name() == machine_name:
|
||||||
return # nothing to do : machineName is already under the control of ClusterController
|
return # nothing to do : machineName is already under the control of ClusterController
|
||||||
|
|
||||||
clusterNode = ClusterNode(machineName, self, self.m_gridEngine)
|
cluster_node = ClusterNode(machine_name, self, self.grid_engine)
|
||||||
if machineName == 'simpatix10':
|
if machine_name == 'simpatix10':
|
||||||
clusterNode.setShouldAlwaysBeOn()
|
cluster_node.set_should_always_be_on()
|
||||||
self.m_clusterNodes[machineName] = clusterNode
|
self.cluster_nodes[machine_name] = cluster_node
|
||||||
clusterNode.m_machineStatusUpdater.start()
|
cluster_node.machine_status_updater.start()
|
||||||
else:
|
else:
|
||||||
# remove machineName from control of ClusterController
|
# remove machineName from control of ClusterController
|
||||||
clusterNode = self.m_clusterNodes.get(machineName)
|
cluster_node = self.cluster_nodes.get(machine_name)
|
||||||
if clusterNode:
|
if cluster_node:
|
||||||
clusterNode.m_machineStatusUpdater.m_bStop = True
|
cluster_node.machine_status_updater.stop = True
|
||||||
clusterNode.m_machineStatusUpdater.join()
|
cluster_node.machine_status_updater.join()
|
||||||
self.m_clusterNodes.pop(machineName)
|
self.cluster_nodes.pop(machine_name)
|
||||||
|
|
||||||
def getGridEngine(self):
|
def get_grid_engine(self) -> SunGridEngine:
|
||||||
return self.m_gridEngine
|
return self.grid_engine
|
||||||
|
|
||||||
def getMachines(self):
|
def get_machines(self) -> Dict[ClusterNodeId, ClusterNode]:
|
||||||
return self.m_clusterNodes
|
return self.cluster_nodes
|
||||||
|
|
||||||
def startReadingThreads(self):
|
def start_reading_threads(self):
|
||||||
for k, v in self.m_clusterNodes.items():
|
for _k, v in self.cluster_nodes.items():
|
||||||
v.m_machineStatusUpdater.start()
|
v.machine_status_updater.start()
|
||||||
self.m_jobsStateUpdater.start()
|
self.jobs_state_updater.start()
|
||||||
|
|
||||||
def stopReadingThreads(self):
|
def stop_reading_threads(self):
|
||||||
for k, v in self.m_clusterNodes.items():
|
for _k, v in self.cluster_nodes.items():
|
||||||
v.m_machineStatusUpdater.m_bStop = True
|
v.machine_status_updater.stop = True
|
||||||
v.m_machineStatusUpdater.join()
|
v.machine_status_updater.join()
|
||||||
self.m_jobsStateUpdater.m_bStop = True
|
self.jobs_state_updater.stop = True
|
||||||
self.m_jobsStateUpdater.join()
|
self.jobs_state_updater.join()
|
||||||
|
|
||||||
def onNewJobsState(self, newJobsState):
|
def on_new_jobs_state(self, new_jobs_state: JobsState):
|
||||||
# logDebug('ClusterStatus::onNewJobsState : attempting to acquire lock to access m_jobsState')
|
# logDebug('ClusterStatus::on_new_jobs_state : attempting to acquire lock to access jobs_state')
|
||||||
self.m_lock.acquire()
|
self.lock.acquire()
|
||||||
# logDebug('ClusterStatus::onNewJobsState : got lock to access m_jobsState')
|
# logDebug('ClusterStatus::on_new_jobs_state : got lock to access jobs_state')
|
||||||
self.m_jobsState = newJobsState
|
self.jobs_state = new_jobs_state
|
||||||
self.m_lock.release()
|
self.lock.release()
|
||||||
|
|
||||||
def getJobsOnMachine(self, machineName):
|
def get_jobs_on_machine(self, machine_name: ClusterNodeId) -> Dict[TaskUid, Task]:
|
||||||
return self.m_jobsState.getJobsOnMachine(machineName)
|
return self.jobs_state.get_jobs_on_machine(machine_name)
|
||||||
|
|
||||||
def isReady(self):
|
def is_ready(self) -> bool:
|
||||||
for k, v in self.m_clusterNodes.items():
|
for _k, v in self.cluster_nodes.items():
|
||||||
if not v.isReady():
|
if not v.is_ready():
|
||||||
logInfo('ClusterStatus::isReady : not ready because of ' + v.getName())
|
log_info('ClusterStatus::is_ready : not ready because of ' + v.get_name())
|
||||||
return False
|
return False
|
||||||
# log('ClusterStatus::isReady() : '+k+' is ready')
|
# log('ClusterStatus::is_ready() : '+k+' is ready')
|
||||||
# assert(False)
|
# assert(False)
|
||||||
if self.m_jobsState is None:
|
if self.jobs_state is None:
|
||||||
logInfo('ClusterStatus::isReady : not ready because waiting for jobs state')
|
log_info('ClusterStatus::is_ready : not ready because waiting for jobs state')
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def getIdleMachines(self):
|
def get_idle_machines(self) -> Dict[ClusterNodeId, ClusterNode]:
|
||||||
assert self.isReady
|
assert self.is_ready
|
||||||
bBUG_00000009_IS_STILL_ALIVE = True
|
bBUG_00000009_IS_STILL_ALIVE = True
|
||||||
if bBUG_00000009_IS_STILL_ALIVE:
|
if bBUG_00000009_IS_STILL_ALIVE:
|
||||||
currentTime = time.time()
|
currentTime = time.time()
|
||||||
fJOBS_STATE_MAX_ALLOWED_AGE = 3600
|
fJOBS_STATE_MAX_ALLOWED_AGE = 3600
|
||||||
fJobsStateAge = currentTime - self.m_jobsState.getTime()
|
fJobsStateAge = currentTime - self.jobs_state.get_time()
|
||||||
if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
|
if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
|
||||||
logError('ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge))
|
logError('ClusterStatus::get_idle_machines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge))
|
||||||
assert False
|
assert False
|
||||||
idleMachines = {}
|
idleMachines = {}
|
||||||
for machineName, machine in self.m_clusterNodes.items():
|
for machineName, machine in self.cluster_nodes.items():
|
||||||
if machine.getPowerState() == PowerState.ON:
|
if machine.get_power_state() == PowerState.ON:
|
||||||
jobsOnThisMachine = self.getJobsOnMachine(machineName)
|
jobsOnThisMachine = self.get_jobs_on_machine(machineName)
|
||||||
if len(jobsOnThisMachine) == 0:
|
if len(jobsOnThisMachine) == 0:
|
||||||
idleMachines[machineName] = machine
|
idleMachines[machineName] = machine
|
||||||
return idleMachines
|
return idleMachines
|
||||||
|
|
||||||
def getPendingJobs(self):
|
def get_pending_jobs(self) -> Dict[TaskUid, Task]:
|
||||||
return self.m_jobsState.getPendingJobs()
|
return self.jobs_state.get_pending_jobs()
|
||||||
|
|
||||||
def getJobsState(self):
|
def get_jobs_state(self) -> JobsState:
|
||||||
return self.m_jobsState
|
return self.jobs_state
|
||||||
|
|
||||||
def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements):
|
def queue_machine_fits_job_requirements(self, queue_machine: QueueMachineId, job_requirements: JobRequirements) -> bool:
|
||||||
if jobRequirements.m_queues:
|
if job_requirements.queues:
|
||||||
bQueueIsInAllowedQueues = False
|
bQueueIsInAllowedQueues = False
|
||||||
for queueName in jobRequirements.m_queues:
|
for queueName in job_requirements.queues:
|
||||||
if queueName == queueMachine.getQueueName():
|
if queueName == queue_machine.get_queue_name():
|
||||||
bQueueIsInAllowedQueues = True
|
bQueueIsInAllowedQueues = True
|
||||||
if not bQueueIsInAllowedQueues:
|
if not bQueueIsInAllowedQueues:
|
||||||
logInfo('queueMachineFitsJobRequirements : queueMachine ' + queueMachine.getName() + ' rejected because it\'s not in the allowed queues')
|
log_info('queue_machine_fits_job_requirements : queue_machine ' + queue_machine.get_name() + ' rejected because it\'s not in the allowed queues')
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def getEnergyConsumption(self):
|
def get_energy_consumption(self) -> float:
|
||||||
"""
|
"""
|
||||||
returns an estimate of the energy consumption since the start of the cluster controller (in joules)
|
returns an estimate of the energy consumption since the start of the cluster controller (in joules)
|
||||||
"""
|
"""
|
||||||
fEnergyConsumption = 0.0
|
fEnergyConsumption = 0.0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
if machine.isReady(): # there are cases where the machine is not ready yet (for example, it's just been added to clustercontroller's control)
|
if machine.is_ready(): # there are cases where the machine is not ready yet (for example, it's just been added to clustercontroller's control)
|
||||||
fEnergyConsumption += machine.getEnergyConsumption()
|
fEnergyConsumption += machine.get_energy_consumption()
|
||||||
return fEnergyConsumption
|
return fEnergyConsumption
|
||||||
|
|
||||||
def getEnergySavings(self):
|
def get_energy_savings(self) -> float:
|
||||||
"""
|
"""
|
||||||
returns an estimate of the energy saving since the start of the cluster controller (in joules)
|
returns an estimate of the energy saving since the start of the cluster controller (in joules)
|
||||||
"""
|
"""
|
||||||
fEnergySavings = 0.0
|
fEnergySavings = 0.0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
if machine.isReady():
|
if machine.is_ready():
|
||||||
fEnergySavings += machine.getEnergySavings()
|
fEnergySavings += machine.get_energy_savings()
|
||||||
return fEnergySavings
|
return fEnergySavings
|
||||||
|
|
||||||
def getCurrentPowerConsumption(self):
|
def get_current_power_consumption(self) -> float:
|
||||||
fPowerConsumption = 0.0
|
power_consumption = 0.0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
if machine.isReady():
|
if machine.is_ready():
|
||||||
fPowerConsumption += machine.getPowerConsumption()
|
power_consumption += machine.get_power_consumption()
|
||||||
return fPowerConsumption
|
return power_consumption
|
||||||
|
|
||||||
def getCurrentPowerSavings(self):
|
def get_current_power_savings(self) -> float:
|
||||||
fPowerSavings = 0.0
|
power_savings = 0.0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
if machine.isReady():
|
if machine.is_ready():
|
||||||
fPowerSavings += machine.getPowerConsumptionForPowerState(PowerState.ON) - machine.getPowerConsumption()
|
power_savings += machine.get_power_consumption_for_power_state(PowerState.ON) - machine.get_power_consumption()
|
||||||
return fPowerSavings
|
return power_savings
|
||||||
|
|
||||||
def getNumControlledSlots(self):
|
def get_num_controlled_slots(self) -> int:
|
||||||
self.m_lock.acquire()
|
self.lock.acquire()
|
||||||
iNumControlledSlots = 0
|
num_controlled_slots = 0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
|
||||||
iNumControlledSlots += queueMachine.getNumSlots()
|
num_controlled_slots += queue_machine.get_num_slots()
|
||||||
self.m_lock.release()
|
self.lock.release()
|
||||||
return iNumControlledSlots
|
return num_controlled_slots
|
||||||
|
|
||||||
def getNumUsedSlots(self):
|
def get_num_used_slots(self) -> int:
|
||||||
self.m_lock.acquire()
|
self.lock.acquire()
|
||||||
iNumUsedSlots = 0
|
num_used_slots = 0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
|
||||||
iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
|
num_used_slots_on_this_machine = queue_machine.get_num_slots() - self.jobs_state.getNumFreeSlotsOnQueueMachine(queue_machine)
|
||||||
assert iNumUsedSlotsOnThisMachine >= 0
|
assert num_used_slots_on_this_machine >= 0
|
||||||
iNumUsedSlots += iNumUsedSlotsOnThisMachine
|
num_used_slots += num_used_slots_on_this_machine
|
||||||
self.m_lock.release()
|
self.lock.release()
|
||||||
return iNumUsedSlots
|
return num_used_slots
|
||||||
|
|
||||||
def getNumWastedSlots(self):
|
def get_num_wasted_slots(self) -> int:
|
||||||
self.m_lock.acquire()
|
self.lock.acquire()
|
||||||
iNumWastedSlots = 0
|
iNumWastedSlots = 0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
if machine.getPowerState() == PowerState.ON:
|
if machine.get_power_state() == PowerState.ON:
|
||||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
|
||||||
iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
|
iNumWastedSlots += self.jobs_state.getNumFreeSlotsOnQueueMachine(queue_machine)
|
||||||
self.m_lock.release()
|
self.lock.release()
|
||||||
return iNumWastedSlots
|
return iNumWastedSlots
|
||||||
|
|
||||||
def getNumSleepingSlots(self):
|
def get_num_sleeping_slots(self) -> int:
|
||||||
self.m_lock.acquire()
|
self.lock.acquire()
|
||||||
iNumSleepingSlots = 0
|
iNumSleepingSlots = 0
|
||||||
for machine in self.m_clusterNodes.values():
|
for machine in self.cluster_nodes.values():
|
||||||
if machine.getPowerState() == PowerState.SLEEP:
|
if machine.get_power_state() == PowerState.SLEEP:
|
||||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
queue_machine = self.jobs_state.get_queue_machine(machine.get_name())
|
||||||
iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
|
iNumSleepingSlots += self.jobs_state.getNumFreeSlotsOnQueueMachine(queue_machine)
|
||||||
self.m_lock.release()
|
self.lock.release()
|
||||||
return iNumSleepingSlots
|
return iNumSleepingSlots
|
||||||
|
|
|
@ -26,7 +26,7 @@ if __name__ == '__main__':
|
||||||
remoteCommand += 'launchctl unload /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
|
remoteCommand += 'launchctl unload /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
|
||||||
remoteCommand += 'launchctl load /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
|
remoteCommand += 'launchctl load /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
|
||||||
command = 'ssh root@'+ machineName +' "'+remoteCommand+'"'
|
command = 'ssh root@'+ machineName +' "'+remoteCommand+'"'
|
||||||
( returnCode, stdout, stderr ) = executeCommand( command )
|
( returnCode, stdout, stderr ) = execute_command( command )
|
||||||
for strSingleCommand in remoteCommand.split(';'):
|
for strSingleCommand in remoteCommand.split(';'):
|
||||||
print(strSingleCommand)
|
print(strSingleCommand)
|
||||||
print(stdout)
|
print(stdout)
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import enum
|
||||||
from typing import Optional, Dict, List
|
from typing import Optional, Dict, List
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
@ -13,7 +14,7 @@ class JobStateFlags:
|
||||||
SUSPENDED = 128
|
SUSPENDED = 128
|
||||||
|
|
||||||
|
|
||||||
class ParallelEnvironment:
|
class ParallelEnvironment(enum.Enum):
|
||||||
MPI = 1
|
MPI = 1
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,14 +30,14 @@ ResourceRequest = str # eg 'mem_available=5G'
|
||||||
class JobRequirements:
|
class JobRequirements:
|
||||||
num_slots: Optional[int]
|
num_slots: Optional[int]
|
||||||
architecture: Optional[str] # machine architecture
|
architecture: Optional[str] # machine architecture
|
||||||
m_parallelEnvironment: Optional[int] # todo: make ParallelEnvironment an Enum
|
parallel_environment: Optional[ParallelEnvironment]
|
||||||
queues: Optional[List[QueueId]] # the list of queues this job is allowed to run on
|
queues: Optional[List[QueueId]] # the list of queues this job is allowed to run on
|
||||||
resources: Optional[List[ResourceRequest]]
|
resources: Optional[List[ResourceRequest]]
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.num_slots = None
|
self.num_slots = None
|
||||||
self.architecture = None
|
self.architecture = None
|
||||||
self.m_parallelEnvironment = None
|
self.parallel_environment = None
|
||||||
self.queues = None
|
self.queues = None
|
||||||
self.resources = None
|
self.resources = None
|
||||||
|
|
||||||
|
@ -62,10 +63,10 @@ class TaskUid:
|
||||||
"""
|
"""
|
||||||
required to use a TaskUid as a dict hash key
|
required to use a TaskUid as a dict hash key
|
||||||
"""
|
"""
|
||||||
hash = self.job_id * self.MAX_NUM_JOBS_IN_ARRAY
|
_hash = self.job_id * self.MAX_NUM_JOBS_IN_ARRAY
|
||||||
if self.task_id is not None:
|
if self.task_id is not None:
|
||||||
hash += self.task_id
|
_hash += self.task_id
|
||||||
return hash
|
return _hash
|
||||||
|
|
||||||
def __eq__(self, other: 'TaskUid'):
|
def __eq__(self, other: 'TaskUid'):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,26 +1,30 @@
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from .Log import *
|
from datetime import datetime
|
||||||
from .Job import Task, TaskUid
|
# from .Log import log_info
|
||||||
|
from .Job import Task, TaskUid, QueueMachineId
|
||||||
|
from .QueueMachine import QueueMachine
|
||||||
|
|
||||||
|
|
||||||
class JobsState:
|
class JobsState:
|
||||||
"""
|
"""
|
||||||
represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*"
|
represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \\*"
|
||||||
"""
|
"""
|
||||||
tasks: Dict[TaskUid, Task]
|
tasks: Dict[TaskUid, Task] # list of tasks
|
||||||
job_array_tasks: Dict[int, Dict[TaskUid, Task]]
|
job_array_tasks: Dict[int, Dict[TaskUid, Task]] # a dictionary of jobs for each job array, indexed by job array id
|
||||||
|
queue_machines: Dict[QueueMachineId, QueueMachine] # list of queue machines such as allintel.q@simpatix10
|
||||||
|
state_time: datetime # the time at which the state was snapshot
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.tasks = {} # list of jobs
|
self.tasks = {}
|
||||||
self.job_array_tasks = {} # a dictionary of jobs for each job array, indexed by job array id
|
self.job_array_tasks = {}
|
||||||
self.m_queueMachines = {} # list of queue machines such as allintel.q@simpatix10
|
self.queue_machines = {}
|
||||||
self.m_stateTime = None # the time at which the state was snapshot
|
self.state_time = None
|
||||||
|
|
||||||
def deleteAllJobs(self):
|
def delete_all_tasks(self):
|
||||||
self.tasks = {}
|
self.tasks = {}
|
||||||
self.job_array_tasks = {}
|
self.job_array_tasks = {}
|
||||||
|
|
||||||
def addTask(self, task: Task):
|
def add_task(self, task: Task):
|
||||||
task_uid = task.get_id()
|
task_uid = task.get_id()
|
||||||
self.tasks[task_uid] = task
|
self.tasks[task_uid] = task
|
||||||
if task_uid.is_job_array_element():
|
if task_uid.is_job_array_element():
|
||||||
|
@ -36,56 +40,56 @@ class JobsState:
|
||||||
def get_job_array_tasks(self, job_array_id: int) -> Dict[TaskUid, Task]:
|
def get_job_array_tasks(self, job_array_id: int) -> Dict[TaskUid, Task]:
|
||||||
return self.job_array_tasks.get(job_array_id)
|
return self.job_array_tasks.get(job_array_id)
|
||||||
|
|
||||||
def setTime(self, stateTime):
|
def set_time(self, state_time: datetime):
|
||||||
self.m_stateTime = stateTime
|
self.state_time = state_time
|
||||||
|
|
||||||
def getTime(self):
|
def get_time(self) -> datetime:
|
||||||
return self.m_stateTime
|
return self.state_time
|
||||||
|
|
||||||
def getJobsOnMachine(self, machineName):
|
def get_jobs_on_machine(self, machine_name: str) -> Dict[TaskUid, Task]:
|
||||||
jobs_on_machine = {}
|
jobs_on_machine = {}
|
||||||
for task_uid, task in self.tasks.items():
|
for task_uid, task in self.tasks.items():
|
||||||
for queueMachineName, numSlots in task.get_slots().items():
|
for queue_machine_name, _num_slots in task.get_slots().items():
|
||||||
jobMachineName = queueMachineName.split('@')[1]
|
jobMachineName = queue_machine_name.split('@')[1]
|
||||||
if jobMachineName == machineName:
|
if jobMachineName == machine_name:
|
||||||
jobs_on_machine[task_uid] = task
|
jobs_on_machine[task_uid] = task
|
||||||
return jobs_on_machine
|
return jobs_on_machine
|
||||||
|
|
||||||
def getNumFreeSlotsOnQueueMachine(self, queueMachine):
|
def get_num_free_slots_on_queue_machine(self, queue_machine: QueueMachine) -> int:
|
||||||
# logInfo('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.getName())
|
# log_info('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.get_name())
|
||||||
numUsedSlots = 0
|
numUsedSlots = 0
|
||||||
for job in self.tasks.values():
|
for job in self.tasks.values():
|
||||||
numUsedSlotsByThisJob = job.get_slots().get(queueMachine.getName())
|
numUsedSlotsByThisJob = job.get_slots().get(queue_machine.get_name())
|
||||||
if numUsedSlotsByThisJob is not None:
|
if numUsedSlotsByThisJob is not None:
|
||||||
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob))
|
# log_info('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob))
|
||||||
numUsedSlots += numUsedSlotsByThisJob
|
numUsedSlots += numUsedSlotsByThisJob
|
||||||
else:
|
else:
|
||||||
None
|
pass
|
||||||
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr())
|
# log_info('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr())
|
||||||
numFreeSlots = queueMachine.getNumSlots() - numUsedSlots
|
numFreeSlots = queue_machine.get_num_slots() - numUsedSlots
|
||||||
assert numFreeSlots >= 0
|
assert numFreeSlots >= 0
|
||||||
return numFreeSlots
|
return numFreeSlots
|
||||||
|
|
||||||
def addQueueMachine(self, queueMachine):
|
def add_queue_machine(self, queue_machine: QueueMachine):
|
||||||
self.m_queueMachines[queueMachine.getName()] = queueMachine
|
self.queue_machines[queue_machine.get_name()] = queue_machine
|
||||||
|
|
||||||
def getQueueMachine(self, machineName):
|
def get_queue_machine(self, machine_name) -> QueueMachine:
|
||||||
"""
|
"""
|
||||||
finds the queue machine associated with a machine
|
finds the queue machine associated with a machine
|
||||||
"""
|
"""
|
||||||
queueMachine = None
|
queueMachine = None
|
||||||
for qmName, qm in self.m_queueMachines.items():
|
for _qname, qm in self.queue_machines.items():
|
||||||
if qm.m_machineName == machineName:
|
if qm.machine_name == machine_name:
|
||||||
assert queueMachine is None # to be sure that no more than one queue machine is on a given machine
|
assert queueMachine is None # to be sure that no more than one queue machine is on a given machine
|
||||||
queueMachine = qm
|
queueMachine = qm
|
||||||
return queueMachine
|
return queueMachine
|
||||||
|
|
||||||
def getQueueMachines(self):
|
def get_queue_machines(self) -> Dict[QueueMachineId, QueueMachine]:
|
||||||
return self.m_queueMachines
|
return self.queue_machines
|
||||||
|
|
||||||
def getPendingJobs(self):
|
def get_pending_jobs(self) -> Dict[TaskUid, Task]:
|
||||||
pendingJobs = {}
|
pending_jobs = {}
|
||||||
for jobId, job in self.tasks.items():
|
for _task_id, task in self.tasks.items():
|
||||||
if job.is_pending():
|
if task.is_pending():
|
||||||
pendingJobs[job.get_id()] = job
|
pending_jobs[task.get_id()] = task
|
||||||
return pendingJobs
|
return pending_jobs
|
||||||
|
|
|
@ -1,35 +1,39 @@
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
import threading
|
import threading
|
||||||
import Util
|
|
||||||
import os
|
|
||||||
import traceback
|
|
||||||
import sys
|
|
||||||
import time
|
import time
|
||||||
|
from .Util import on_exception
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from .ClusterStatus import ClusterStatus
|
||||||
|
|
||||||
class JobsStateUpdater( threading.Thread ):
|
|
||||||
DELAY_BETWEEN_STATUS_CHECKS=10 # in seconds
|
class JobsStateUpdater(threading.Thread):
|
||||||
def __init__( self, clusterStatus ):
|
cluster_status: 'ClusterStatus'
|
||||||
|
stop: bool
|
||||||
|
DELAY_BETWEEN_STATUS_CHECKS = 10 # in seconds
|
||||||
|
|
||||||
|
def __init__(self, clusterStatus):
|
||||||
threading.Thread.__init__(self)
|
threading.Thread.__init__(self)
|
||||||
self.m_clusterStatus = clusterStatus
|
self.cluster_status = clusterStatus
|
||||||
self.m_bStop = False
|
self.stop = False
|
||||||
|
|
||||||
def getName( self ):
|
|
||||||
return 'JobsStateUpdater'
|
|
||||||
|
|
||||||
def getGridEngine( self ):
|
|
||||||
return self.m_clusterStatus.getGridEngine()
|
|
||||||
|
|
||||||
def updateClusterStatus( self ):
|
|
||||||
#log('JobsStateUpdater::updateClusterStatus : start')
|
|
||||||
|
|
||||||
jobsState = self.getGridEngine().getCurrentJobsState()
|
def get_name(self):
|
||||||
|
return 'JobsStateUpdater'
|
||||||
|
|
||||||
|
def get_grid_engine(self):
|
||||||
|
return self.cluster_status.get_grid_engine()
|
||||||
|
|
||||||
|
def updateClusterStatus(self):
|
||||||
|
# log('JobsStateUpdater::updateClusterStatus : start')
|
||||||
|
|
||||||
|
jobsState = self.get_grid_engine().getCurrentJobsState()
|
||||||
# update the jobs in the cluster status
|
# update the jobs in the cluster status
|
||||||
self.m_clusterStatus.onNewJobsState( jobsState )
|
self.cluster_status.on_new_jobs_state(jobsState)
|
||||||
#log('JobsStateUpdater::updateClusterStatus : end')
|
# log('JobsStateUpdater::updateClusterStatus : end')
|
||||||
|
|
||||||
def run( self ):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
while not self.m_bStop :
|
while not self.stop:
|
||||||
self.updateClusterStatus()
|
self.updateClusterStatus()
|
||||||
time.sleep(JobsStateUpdater.DELAY_BETWEEN_STATUS_CHECKS)
|
time.sleep(JobsStateUpdater.DELAY_BETWEEN_STATUS_CHECKS)
|
||||||
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||||
Util.onException(exception)
|
on_exception(exception)
|
||||||
|
|
|
@ -5,10 +5,10 @@ gLogFilePath = '/tmp/ClusterController.log' # '/var/log/ClusterController.log'
|
||||||
|
|
||||||
|
|
||||||
def log(message):
|
def log(message):
|
||||||
threadName = threading.currentThread().getName()
|
threadName = threading.currentThread().get_name()
|
||||||
logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message
|
logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message
|
||||||
print(logMessage)
|
print(logMessage)
|
||||||
f = open(gLogFilePath, 'a+')
|
f = open(gLogFilePath, 'a+', encoding='utf8')
|
||||||
assert f
|
assert f
|
||||||
try:
|
try:
|
||||||
f.write(logMessage + '\n')
|
f.write(logMessage + '\n')
|
||||||
|
@ -21,11 +21,11 @@ def logDebug(message):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def logInfo(message):
|
def log_info(message):
|
||||||
log('[I]' + message)
|
log('[I]' + message)
|
||||||
|
|
||||||
|
|
||||||
def logWarning(message):
|
def log_warning(message):
|
||||||
log('[W]' + message)
|
log('[W]' + message)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
|
|
||||||
class PowerState:
|
import enum
|
||||||
|
|
||||||
|
|
||||||
|
class PowerState(enum.Enum):
|
||||||
UNKNOWN = 0
|
UNKNOWN = 0
|
||||||
OFF = 1
|
OFF = 1
|
||||||
ON = 2
|
ON = 2
|
||||||
|
|
|
@ -2,7 +2,6 @@ import io
|
||||||
import re
|
import re
|
||||||
from .JobsState import JobsState
|
from .JobsState import JobsState
|
||||||
from .QueueMachine import QueueMachine, QueueMachineStateFlags
|
from .QueueMachine import QueueMachine, QueueMachineStateFlags
|
||||||
from .Util import *
|
|
||||||
from .Log import logError
|
from .Log import logError
|
||||||
from .Job import JobStateFlags, TaskUid, Task, ParallelEnvironment, JobState
|
from .Job import JobStateFlags, TaskUid, Task, ParallelEnvironment, JobState
|
||||||
import logging
|
import logging
|
||||||
|
@ -54,13 +53,13 @@ class QstatParser:
|
||||||
assert False, 'unhandled queue machine state flag :"' + c + '"'
|
assert False, 'unhandled queue machine state flag :"' + c + '"'
|
||||||
return queueMachineState
|
return queueMachineState
|
||||||
|
|
||||||
def parseQstatOutput(self, qstatOutput, cluster_domain: str = 'ipr.univ-rennes1.fr'):
|
def parse_qstat_output(self, qstat_output: str, cluster_domain: str = 'ipr.univ-rennes1.fr'):
|
||||||
"""
|
"""
|
||||||
parses result of command 'qstat -f -u \\* -pri'
|
parses result of command 'qstat -f -u \\* -pri'
|
||||||
|
|
||||||
cluster_domain: network domain of the cluster (eg 'ipr.univ-rennes.fr'). This information is missing from qstat's output and is used to form the fully qualified domain name of the cluster machines.
|
cluster_domain: network domain of the cluster (eg 'ipr.univ-rennes.fr'). This information is missing from qstat's output and is used to form the fully qualified domain name of the cluster machines.
|
||||||
"""
|
"""
|
||||||
logging.debug('qstatOutput type : %s' % type(qstatOutput))
|
logging.debug('qstatOutput type : %s', type(qstat_output))
|
||||||
|
|
||||||
def parse_pending_tasks(task_ranges_sequence):
|
def parse_pending_tasks(task_ranges_sequence):
|
||||||
"""
|
"""
|
||||||
|
@ -99,13 +98,13 @@ class QstatParser:
|
||||||
# ---------------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------------
|
||||||
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
|
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
|
||||||
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
|
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
|
||||||
qstatOutput = re.sub(r'\.ipr\.univ[^ ]*', f'.{cluster_domain}', qstatOutput)
|
qstat_output = re.sub(r'\.ipr\.univ[^ ]*', f'.{cluster_domain}', qstat_output)
|
||||||
|
|
||||||
jobsState = JobsState()
|
jobsState = JobsState()
|
||||||
f = io.StringIO(qstatOutput)
|
f = io.StringIO(qstat_output)
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
currentQueueMachine = None
|
current_queue_machine = None
|
||||||
bInPendingJobsSection = False
|
in_pending_jobs_section = False
|
||||||
# examples of job line :
|
# examples of job line :
|
||||||
# 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1
|
# 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1
|
||||||
# a typical job line in the pending jobs section looks like this :
|
# a typical job line in the pending jobs section looks like this :
|
||||||
|
@ -120,42 +119,42 @@ class QstatParser:
|
||||||
# ntckts The job's ticket amount in normalized fashion.
|
# ntckts The job's ticket amount in normalized fashion.
|
||||||
# ppri The job's -p priority as specified by the user.
|
# ppri The job's -p priority as specified by the user.
|
||||||
|
|
||||||
jobRegularExp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$')
|
job_regular_exp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$')
|
||||||
# example of machine line :
|
# example of machine line :
|
||||||
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
|
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
|
||||||
machineRegularExp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)')
|
machine_regular_exp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)')
|
||||||
pendingJobsHeaderRegularExp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*')
|
pending_jobs_header_regular_exp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*')
|
||||||
while len(line) > 0:
|
while len(line) > 0:
|
||||||
# print line
|
# print line
|
||||||
# check if the current line is a line describing a job running on a machine
|
# check if the current line is a line describing a job running on a machine
|
||||||
matchObj = jobRegularExp.match(line)
|
match_obj = job_regular_exp.match(line)
|
||||||
if matchObj:
|
if match_obj:
|
||||||
# we are dealing with a job line
|
# we are dealing with a job line
|
||||||
if not bInPendingJobsSection:
|
if not in_pending_jobs_section:
|
||||||
assert currentQueueMachine
|
assert current_queue_machine
|
||||||
# log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
|
# log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
|
||||||
job_id = int(matchObj.group('jobId'))
|
job_id = int(match_obj.group('jobId'))
|
||||||
logging.debug('iJobId = %d' % job_id)
|
logging.debug('iJobId = %d', job_id)
|
||||||
jobState = self.parseJobState(matchObj.group('jobStatus'))
|
job_state = self.parseJobState(match_obj.group('jobStatus'))
|
||||||
strJobArrayDetails = matchObj.group('jobArrayDetails')
|
job_array_details = match_obj.group('jobArrayDetails')
|
||||||
bIsJobArray = (len(strJobArrayDetails) != 0)
|
is_job_array = (len(job_array_details) != 0)
|
||||||
# logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
|
# logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
|
||||||
# each element of a job array is treated as a separate job for the sake of simplicity.
|
# each element of a job array is treated as a separate job for the sake of simplicity.
|
||||||
# For these elements, the job id in sge sense is the same, but they are different in this program's sense
|
# For these elements, the job id in sge sense is the same, but they are different in this program's sense
|
||||||
task_ids = range(0, 1) # just one element, unless it's a job array
|
task_ids = range(0, 1) # just one element, unless it's a job array
|
||||||
if bIsJobArray:
|
if is_job_array:
|
||||||
if bInPendingJobsSection:
|
if in_pending_jobs_section:
|
||||||
task_ids = parse_pending_tasks(strJobArrayDetails)
|
task_ids = parse_pending_tasks(job_array_details)
|
||||||
else:
|
else:
|
||||||
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
|
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
|
||||||
iJobArrayElementIndex = int(strJobArrayDetails)
|
task_id = int(job_array_details)
|
||||||
assert iJobArrayElementIndex != 0 # sge does not allow element indices to be 0
|
assert task_id != 0 # sge does not allow element indices to be 0
|
||||||
task_ids = range(iJobArrayElementIndex, iJobArrayElementIndex + 1)
|
task_ids = range(task_id, task_id + 1)
|
||||||
logging.debug('task_ids = %s' % task_ids)
|
logging.debug('task_ids = %s', task_ids)
|
||||||
for task_id in task_ids:
|
for task_id in task_ids:
|
||||||
logging.debug('task_id = %s' % task_id)
|
logging.debug('task_id = %s', task_id)
|
||||||
task_uid = None
|
task_uid = None
|
||||||
if bIsJobArray:
|
if is_job_array:
|
||||||
task_uid = TaskUid(job_id, task_id)
|
task_uid = TaskUid(job_id, task_id)
|
||||||
else:
|
else:
|
||||||
task_uid = TaskUid(job_id)
|
task_uid = TaskUid(job_id)
|
||||||
|
@ -165,57 +164,57 @@ class QstatParser:
|
||||||
# this job hasn't been encountered yet in the output of qstat ...
|
# this job hasn't been encountered yet in the output of qstat ...
|
||||||
# we could either be in the pending jobs section or in the running jobs section
|
# we could either be in the pending jobs section or in the running jobs section
|
||||||
task = Task(task_uid)
|
task = Task(task_uid)
|
||||||
jobsState.addTask(task)
|
jobsState.add_task(task)
|
||||||
task.set_state(jobState)
|
task.set_state(job_state)
|
||||||
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime')
|
job_start_or_submit_time_as_str = match_obj.group('jobStartOrSubmitTime')
|
||||||
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S')
|
job_start_or_submit_time = time.strptime(job_start_or_submit_time_as_str, '%m/%d/%Y %H:%M:%S')
|
||||||
if bInPendingJobsSection:
|
if in_pending_jobs_section:
|
||||||
task.get_submit_time(jobStartOrSubmitTime)
|
task.get_submit_time(job_start_or_submit_time)
|
||||||
else:
|
else:
|
||||||
task.set_start_time(jobStartOrSubmitTime)
|
task.set_start_time(job_start_or_submit_time)
|
||||||
task.set_owner(matchObj.group('jobOwner'))
|
task.set_owner(match_obj.group('jobOwner'))
|
||||||
task.set_script_name(matchObj.group('jobScriptName'))
|
task.set_script_name(match_obj.group('jobScriptName'))
|
||||||
if bInPendingJobsSection:
|
if in_pending_jobs_section:
|
||||||
task.set_num_required_slots(int(matchObj.group('numSlots')))
|
task.set_num_required_slots(int(match_obj.group('numSlots')))
|
||||||
else:
|
else:
|
||||||
assert not bInPendingJobsSection # if we are in the pending jobs section, the job should be new
|
assert not in_pending_jobs_section # if we are in the pending jobs section, the job should be new
|
||||||
if not bInPendingJobsSection:
|
if not in_pending_jobs_section:
|
||||||
task.add_slots(currentQueueMachine.getName(), int(matchObj.group('numSlots')))
|
task.add_slots(current_queue_machine.get_name(), int(match_obj.group('numSlots')))
|
||||||
else:
|
else:
|
||||||
# the current line does not describe a job
|
# the current line does not describe a job
|
||||||
if not bInPendingJobsSection:
|
if not in_pending_jobs_section:
|
||||||
# check if this line describes the status of a machine
|
# check if this line describes the status of a machine
|
||||||
matchObj = machineRegularExp.match(line)
|
match_obj = machine_regular_exp.match(line)
|
||||||
if matchObj:
|
if match_obj:
|
||||||
queueName = matchObj.group('queueName')
|
queue_name = match_obj.group('queueName')
|
||||||
machineName = matchObj.group('machineName')
|
machine_name = match_obj.group('machineName')
|
||||||
queueMachine = QueueMachine(queueName, machineName)
|
queue_machine = QueueMachine(queue_name, machine_name)
|
||||||
# log(line)
|
# log(line)
|
||||||
# log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
|
# log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
|
||||||
# log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
|
# log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
|
||||||
queueMachine.setNumSlots(int(matchObj.group('numTotalSlots')))
|
queue_machine.set_num_slots(int(match_obj.group('numTotalSlots')))
|
||||||
queueMachine.setNumUsedSlots(int(matchObj.group('numUsedSlots')))
|
queue_machine.set_num_used_slots(int(match_obj.group('numUsedSlots')))
|
||||||
strCpuLoad = matchObj.group('cpuLoad')
|
cpu_load_as_str = match_obj.group('cpuLoad')
|
||||||
if strCpuLoad != '-NA-':
|
if cpu_load_as_str != '-NA-':
|
||||||
queueMachine.setCpuLoad(float(strCpuLoad))
|
queue_machine.set_cpu_load(float(cpu_load_as_str))
|
||||||
|
|
||||||
strQueueMachineState = matchObj.group('queueMachineStatus')
|
queue_machine_state_as_str = match_obj.group('queueMachineStatus')
|
||||||
queueMachine.setState(self.parseQueueMachineState(strQueueMachineState))
|
queue_machine.set_state(self.parseQueueMachineState(queue_machine_state_as_str))
|
||||||
# log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
|
# log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
|
||||||
# log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
|
# log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
|
||||||
currentQueueMachine = queueMachine
|
current_queue_machine = queue_machine
|
||||||
jobsState.addQueueMachine(queueMachine)
|
jobsState.add_queue_machine(queue_machine)
|
||||||
else:
|
else:
|
||||||
matchObj = pendingJobsHeaderRegularExp.match(line)
|
match_obj = pending_jobs_header_regular_exp.match(line)
|
||||||
if matchObj:
|
if match_obj:
|
||||||
bInPendingJobsSection = True
|
in_pending_jobs_section = True
|
||||||
currentQueueMachine = None
|
current_queue_machine = None
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
# we are in a pending jobs section
|
# we are in a pending jobs section
|
||||||
matchObj = re.match('^[#]+$', line)
|
match_obj = re.match('^[#]+$', line)
|
||||||
if not matchObj:
|
if not match_obj:
|
||||||
# unexpected line
|
# unexpected line
|
||||||
print('line = "' + line + '"')
|
print('line = "' + line + '"')
|
||||||
assert False
|
assert False
|
||||||
|
@ -223,11 +222,11 @@ class QstatParser:
|
||||||
f.close()
|
f.close()
|
||||||
return jobsState
|
return jobsState
|
||||||
|
|
||||||
def parseJobDetails(self, qstatOutput, job):
|
def parse_job_details(self, qstat_output: str, task: Task):
|
||||||
"""
|
"""
|
||||||
adds to job the details parsed from the output of the "qstat -j <jobid>" command
|
adds to job the details parsed from the output of the "qstat -j <jobid>" command
|
||||||
"""
|
"""
|
||||||
f = io.StringIO(qstatOutput)
|
f = io.StringIO(qstat_output)
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
fieldRegularExp = re.compile('^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$')
|
fieldRegularExp = re.compile('^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$')
|
||||||
while len(line) > 0:
|
while len(line) > 0:
|
||||||
|
@ -238,20 +237,20 @@ class QstatParser:
|
||||||
fieldName = matchObj.group('fieldName')
|
fieldName = matchObj.group('fieldName')
|
||||||
strFieldValue = matchObj.group('fieldValue')
|
strFieldValue = matchObj.group('fieldValue')
|
||||||
if fieldName == 'job_number':
|
if fieldName == 'job_number':
|
||||||
assert job.getId().asStr() == strFieldValue
|
assert task.getId().asStr() == strFieldValue
|
||||||
elif fieldName == 'hard_queue_list':
|
elif fieldName == 'hard_queue_list':
|
||||||
allowedQueues = strFieldValue.split(',')
|
allowedQueues = strFieldValue.split(',')
|
||||||
assert len(allowedQueues) > 0
|
assert len(allowedQueues) > 0
|
||||||
job.m_jobRequirements.m_queues = allowedQueues
|
task.job_requirements.queues = allowedQueues
|
||||||
elif fieldName == 'parallel environment':
|
elif fieldName == 'parallel environment':
|
||||||
# the value could be 'ompi range: 32'
|
# the value could be 'ompi range: 32'
|
||||||
matchObj = re.match('ompi range: (?P<numSlots>[0-9]+)[?]*', strFieldValue)
|
matchObj = re.match('ompi range: (?P<numSlots>[0-9]+)[?]*', strFieldValue)
|
||||||
if matchObj:
|
if matchObj:
|
||||||
job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI
|
task.job_requirements.parallel_environment = ParallelEnvironment.MPI
|
||||||
else:
|
else:
|
||||||
assert False
|
assert False
|
||||||
else:
|
else:
|
||||||
# ignore he other fields
|
# ignore the other fields
|
||||||
None
|
pass
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
f.close()
|
f.close()
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
from typing import Optional
|
||||||
|
from .Job import QueueMachineId
|
||||||
|
from .ClusterNode import ClusterNodeId
|
||||||
|
|
||||||
|
|
||||||
class QueueMachineStateFlags: #
|
class QueueMachineStateFlags: #
|
||||||
DISABLED = 1 # the queue machine is disabled
|
DISABLED = 1 # the queue machine is disabled
|
||||||
|
@ -12,70 +16,73 @@ class QueueMachine:
|
||||||
"""
|
"""
|
||||||
a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
|
a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
|
||||||
"""
|
"""
|
||||||
def __init__(self, queueName, machineName):
|
queue_name: str
|
||||||
self.m_queueName = queueName
|
machine_name: ClusterNodeId
|
||||||
self.m_machineName = machineName
|
num_slots: Optional[int]
|
||||||
self.m_numSlots = None
|
num_used_slots: Optional[int]
|
||||||
self.m_numUsedSlots = None
|
cpu_load: Optional[float]
|
||||||
self.m_fCpuLoad = None
|
state_flags: int
|
||||||
self.m_stateFlags = 0
|
disable_message: str
|
||||||
self.m_strDisableMessage = ''
|
|
||||||
|
|
||||||
def getName(self):
|
def __init__(self, queueName, machineName):
|
||||||
|
self.queue_name = queueName
|
||||||
|
self.machine_name = machineName
|
||||||
|
self.num_slots = None
|
||||||
|
self.num_used_slots = None
|
||||||
|
self.cpu_load = None
|
||||||
|
self.state_flags = 0
|
||||||
|
self.disable_message = ''
|
||||||
|
|
||||||
|
def get_name(self) -> QueueMachineId:
|
||||||
"""
|
"""
|
||||||
returns the name of the machine queue (such as allintel.q@simpatix10)
|
returns the name of the machine queue (such as allintel.q@simpatix10)
|
||||||
"""
|
"""
|
||||||
return self.m_queueName + '@' + self.m_machineName
|
return self.queue_name + '@' + self.machine_name
|
||||||
|
|
||||||
def getQueueName(self):
|
def get_queue_name(self) -> str:
|
||||||
return self.m_queueName
|
return self.queue_name
|
||||||
|
|
||||||
def getMachineName(self):
|
def get_machine_name(self) -> str:
|
||||||
return self.m_machineName
|
return self.machine_name
|
||||||
|
|
||||||
def setNumSlots(self, numSlots):
|
def set_num_slots(self, num_slots: int):
|
||||||
self.m_numSlots = numSlots
|
self.num_slots = num_slots
|
||||||
|
|
||||||
def setNumUsedSlots(self, numSlots):
|
def set_num_used_slots(self, num_slots: int):
|
||||||
self.m_numUsedSlots = numSlots
|
self.num_used_slots = num_slots
|
||||||
|
|
||||||
def getNumSlots(self):
|
def get_num_slots(self) -> int:
|
||||||
assert self.m_numSlots is not None
|
assert self.num_slots is not None
|
||||||
return self.m_numSlots
|
return self.num_slots
|
||||||
|
|
||||||
def getNumUsedSlots(self):
|
def get_num_used_slots(self) -> int:
|
||||||
assert self.m_numUsedSlots is not None
|
assert self.num_used_slots is not None
|
||||||
return self.m_numUsedSlots
|
return self.num_used_slots
|
||||||
|
|
||||||
def setCpuLoad(self, fCpuLoad):
|
def set_cpu_load(self, cpu_load: float):
|
||||||
self.m_fCpuLoad = fCpuLoad
|
self.cpu_load = cpu_load
|
||||||
|
|
||||||
def cpuLoadIsAvailable(self):
|
def cpu_load_is_available(self) -> bool:
|
||||||
return self.m_fCpuLoad is not None
|
return self.cpu_load is not None
|
||||||
|
|
||||||
def getCpuLoad(self):
|
def get_cpu_load(self) -> float:
|
||||||
assert self.m_fCpuLoad is not None
|
assert self.cpu_load is not None
|
||||||
return self.m_fCpuLoad
|
return self.cpu_load
|
||||||
|
|
||||||
def setState(self, state):
|
def set_state(self, state: int):
|
||||||
self.m_stateFlags = state
|
self.state_flags = state
|
||||||
|
|
||||||
def isDisabled(self):
|
def is_disabled(self) -> bool:
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.DISABLED
|
return self.state_flags & QueueMachineStateFlags.DISABLED
|
||||||
|
|
||||||
def isInErrorState(self):
|
def is_in_error_state(self) -> bool:
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.ERROR
|
return self.state_flags & QueueMachineStateFlags.ERROR
|
||||||
|
|
||||||
def isResponding(self):
|
def is_responding(self) -> bool:
|
||||||
return not (self.m_stateFlags & QueueMachineStateFlags.UNKNOWN)
|
return not (self.state_flags & QueueMachineStateFlags.UNKNOWN)
|
||||||
|
|
||||||
def isInAlarmState(self):
|
def is_in_alarm_state(self) -> bool:
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.ALARM
|
return self.state_flags & QueueMachineStateFlags.ALARM
|
||||||
|
|
||||||
def isSuspended(self):
|
def is_suspended(self) -> bool:
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.SUSPENDED
|
return self.state_flags & QueueMachineStateFlags.SUSPENDED
|
||||||
"""
|
|
||||||
def getStateAsString(self):
|
|
||||||
assert(self.m_strState is not None)
|
|
||||||
return self.m_strState
|
|
||||||
"""
|
|
||||||
|
|
|
@ -1,21 +1,24 @@
|
||||||
from PowerState import PowerState
|
|
||||||
from Log import logInfo
|
import abc
|
||||||
import time
|
import time
|
||||||
import copy
|
import copy
|
||||||
|
from .PowerState import PowerState
|
||||||
|
from .Log import log_info
|
||||||
|
|
||||||
|
|
||||||
class Slot:
|
class Slot:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.m_queueMachine = None
|
self.queue_machine = None
|
||||||
self.m_numSlots = None
|
self.num_slots = None
|
||||||
self.m_job = None # job for which this slot is allocated
|
self.jobs = None # job for which this slot is allocated
|
||||||
|
|
||||||
|
|
||||||
class SlotAllocator:
|
class SlotAllocator(abc.ABCMeta):
|
||||||
"""
|
"""
|
||||||
a class that defines a strategy for allocating free slots for the given pending jobs
|
a class that defines a strategy for allocating free slots for the given pending jobs
|
||||||
"""
|
"""
|
||||||
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
@abc.abstractmethod
|
||||||
|
def get_machinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||||
"""
|
"""
|
||||||
returns the list of machines that need to wake up to make pending jobs running
|
returns the list of machines that need to wake up to make pending jobs running
|
||||||
"""
|
"""
|
||||||
|
@ -23,44 +26,45 @@ class SlotAllocator:
|
||||||
|
|
||||||
|
|
||||||
class SimpleSlotAllocator(SlotAllocator):
|
class SimpleSlotAllocator(SlotAllocator):
|
||||||
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
|
||||||
|
def get_machinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||||
machinesThatNeedWakeUp = {}
|
machinesThatNeedWakeUp = {}
|
||||||
highestPriorityPendingJob = pendingJobs.values()[0]
|
highestPriorityPendingJob = pendingJobs.values()[0]
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())
|
||||||
numFreeSlots = {} # contains the number of free slots for each queueMachine
|
numFreeSlots = {} # contains the number of free slots for each queueMachine
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
|
||||||
numFreeSlots[queueMachine] = clusterState.getJobsState().getNumFreeSlotsOnQueueMachine(queueMachine)
|
numFreeSlots[queueMachine] = clusterState.get_jobs_state().getNumFreeSlotsOnQueueMachine(queueMachine)
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[%s] with %d ' % (queueMachine.getName(), numFreeSlots[queueMachine]))
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : init numFreeSlots[%s] with %d ' % (queueMachine.get_name(), numFreeSlots[queueMachine]))
|
||||||
remainingNumSlotsToAllocate = highestPriorityPendingJob.m_jobRequirements.m_numSlots
|
remainingNumSlotsToAllocate = highestPriorityPendingJob.job_requirements.num_slots
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||||
# first look in running machines if there are available slots
|
# first look in running machines if there are available slots
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.get_name())
|
||||||
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
machine = clusterState.get_machines()[queueMachine.get_machine_name()]
|
||||||
if machine.getPowerState() == PowerState.ON:
|
if machine.get_power_state() == PowerState.ON:
|
||||||
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements):
|
if clusterState.queue_machine_fits_job_requirements(queueMachine, highestPriorityPendingJob.job_requirements):
|
||||||
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName()))
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.get_machine_name()))
|
||||||
|
|
||||||
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
||||||
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||||
assert remainingNumSlotsToAllocate >= 0
|
assert remainingNumSlotsToAllocate >= 0
|
||||||
if remainingNumSlotsToAllocate == 0:
|
if remainingNumSlotsToAllocate == 0:
|
||||||
break
|
break
|
||||||
if remainingNumSlotsToAllocate > 0:
|
if remainingNumSlotsToAllocate > 0:
|
||||||
# now look into machines that are asleep
|
# now look into machines that are asleep
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.get_name())
|
||||||
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
machine = clusterState.get_machines()[queueMachine.get_machine_name()]
|
||||||
if machine.getPowerState() == PowerState.SLEEP:
|
if machine.get_power_state() == PowerState.SLEEP:
|
||||||
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements):
|
if clusterState.queue_machine_fits_job_requirements(queueMachine, highestPriorityPendingJob.job_requirements):
|
||||||
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName()))
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.get_machine_name()))
|
||||||
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
||||||
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
||||||
machinesThatNeedWakeUp[machine.getName()] = machine
|
machinesThatNeedWakeUp[machine.get_name()] = machine
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
log_info('SimpleSlotAllocator::get_machinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||||
assert remainingNumSlotsToAllocate >= 0
|
assert remainingNumSlotsToAllocate >= 0
|
||||||
if remainingNumSlotsToAllocate == 0:
|
if remainingNumSlotsToAllocate == 0:
|
||||||
break
|
break
|
||||||
|
@ -75,9 +79,9 @@ class DecoupledSlotAllocator(SlotAllocator):
|
||||||
Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
|
Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
|
||||||
"""
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.m_delayBetweenPeriodicChecks = -1 # in seconds. Disable periodic checks by setting this to -1
|
self.delay_between_periodic_checks = -1 # in seconds. Disable periodic checks by setting this to -1
|
||||||
self.m_lastCheckTime = time.time()
|
self.last_check_time = time.time()
|
||||||
self.m_lastClusterState = None
|
self.last_cluster_state = None
|
||||||
|
|
||||||
def jobsStateHasChanged(self, newClusterState):
|
def jobsStateHasChanged(self, newClusterState):
|
||||||
"""
|
"""
|
||||||
|
@ -85,26 +89,24 @@ class DecoupledSlotAllocator(SlotAllocator):
|
||||||
to start (provided all machines are enabled)
|
to start (provided all machines are enabled)
|
||||||
"""
|
"""
|
||||||
oldJobs = {}
|
oldJobs = {}
|
||||||
if self.m_lastClusterState:
|
if self.last_cluster_state:
|
||||||
oldJobs = self.m_lastClusterState.m_jobsState.m_jobs
|
oldJobs = self.last_cluster_state.jobs_state.jobs
|
||||||
newJobs = newClusterState.m_jobsState.m_jobs
|
newJobs = newClusterState.jobs_state.jobs
|
||||||
bJobsHaveChanged = False
|
bJobsHaveChanged = False
|
||||||
oldJobsOnly = oldJobs.copy() # shallow copy
|
oldJobsOnly = oldJobs.copy() # shallow copy
|
||||||
# print 'oldJobs : ', oldJobs
|
# print 'oldJobs : ', oldJobs
|
||||||
# print 'newJobs : ', newJobs
|
# print 'newJobs : ', newJobs
|
||||||
"""
|
# print 'self.last_cluster_state', self.last_cluster_state
|
||||||
print 'self.m_lastClusterState', self.m_lastClusterState
|
# print 'newClusterState', newClusterState
|
||||||
print 'newClusterState', newClusterState
|
# if self.last_cluster_state:
|
||||||
if self.m_lastClusterState:
|
# print 'self.last_cluster_state.jobs_state', self.last_cluster_state.jobs_state
|
||||||
print 'self.m_lastClusterState.m_jobsState', self.m_lastClusterState.m_jobsState
|
# print 'newClusterState.jobs_state', newClusterState.jobs_state
|
||||||
print 'newClusterState.m_jobsState', newClusterState.m_jobsState
|
# print 'id(self.last_cluster_state) : ', id(self.last_cluster_state)
|
||||||
print 'id(self.m_lastClusterState) : ', id(self.m_lastClusterState)
|
# print 'id(newClusterState) : ', id(newClusterState)
|
||||||
print 'id(newClusterState) : ', id(newClusterState)
|
# print 'len(oldJobs) : ', len(oldJobs)
|
||||||
print 'len(oldJobs) : ', len(oldJobs)
|
# print 'len(newJobs) : ', len(newJobs)
|
||||||
print 'len(newJobs) : ', len(newJobs)
|
# print 'id(oldJobs) : ', id(oldJobs)
|
||||||
print 'id(oldJobs) : ', id(oldJobs)
|
# print 'id(newJobs) : ', id(newJobs)
|
||||||
print 'id(newJobs) : ', id(newJobs)
|
|
||||||
"""
|
|
||||||
for newJob in newJobs.values():
|
for newJob in newJobs.values():
|
||||||
# logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
|
# logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
|
||||||
if newJob.getId() in oldJobs:
|
if newJob.getId() in oldJobs:
|
||||||
|
@ -112,36 +114,36 @@ class DecoupledSlotAllocator(SlotAllocator):
|
||||||
del oldJobsOnly[newJob.getId()]
|
del oldJobsOnly[newJob.getId()]
|
||||||
else:
|
else:
|
||||||
# ah ... a new job has arrived
|
# ah ... a new job has arrived
|
||||||
logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr())
|
log_info('A new job (jobId =%s) has been detected ' % newJob.getId().asStr())
|
||||||
bJobsHaveChanged = True
|
bJobsHaveChanged = True
|
||||||
if len(oldJobsOnly) != 0:
|
if len(oldJobsOnly) != 0:
|
||||||
for oldJob in oldJobsOnly.values():
|
for oldJob in oldJobsOnly.values():
|
||||||
logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr())
|
log_info('Job (jobId =%s) has finished' % oldJob.getId().asStr())
|
||||||
# at least one old job has finished, freeing some slots
|
# at least one old job has finished, freeing some slots
|
||||||
bJobsHaveChanged = True
|
bJobsHaveChanged = True
|
||||||
return bJobsHaveChanged
|
return bJobsHaveChanged
|
||||||
|
|
||||||
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
def get_machinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||||
machinesThatNeedWakeUp = {}
|
machinesThatNeedWakeUp = {}
|
||||||
bJobsStateHasChanged = self.jobsStateHasChanged(clusterState)
|
bJobsStateHasChanged = self.jobsStateHasChanged(clusterState) # pylint: disable=no-value-for-parameter
|
||||||
currentTime = time.time()
|
currentTime = time.time()
|
||||||
# we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
|
# we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
|
||||||
# for example changes in the requirements, in the allocation policy, etc...
|
# for example changes in the requirements, in the allocation policy, etc...
|
||||||
bItsTimeForPeriodicCheck = False
|
bItsTimeForPeriodicCheck = False
|
||||||
if self.m_delayBetweenPeriodicChecks > 0:
|
if self.delay_between_periodic_checks > 0:
|
||||||
bItsTimeForPeriodicCheck = (currentTime - self.m_lastCheckTime) > self.m_delayBetweenPeriodicChecks
|
bItsTimeForPeriodicCheck = (currentTime - self.last_check_time) > self.delay_between_periodic_checks
|
||||||
if bJobsStateHasChanged or bItsTimeForPeriodicCheck:
|
if bJobsStateHasChanged or bItsTimeForPeriodicCheck:
|
||||||
if bJobsStateHasChanged:
|
if bJobsStateHasChanged:
|
||||||
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed')
|
log_info('DecoupledSlotAllocator::get_machinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed')
|
||||||
else:
|
else:
|
||||||
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)')
|
log_info('DecoupledSlotAllocator::get_machinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)')
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
for queueMachine in clusterState.get_jobs_state().get_queue_machines().values():
|
||||||
if queueMachine.getMachineName() in clusterState.getMachines():
|
if queueMachine.get_machine_name() in clusterState.get_machines():
|
||||||
# this means that the machine is under the cluster controller's control
|
# this means that the machine is under the cluster controller's control
|
||||||
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
machine = clusterState.get_machines()[queueMachine.get_machine_name()]
|
||||||
if machine.getPowerState() == PowerState.SLEEP:
|
if machine.get_power_state() == PowerState.SLEEP:
|
||||||
machinesThatNeedWakeUp[machine.getName()] = machine
|
machinesThatNeedWakeUp[machine.get_name()] = machine
|
||||||
self.m_lastCheckTime = currentTime
|
self.last_check_time = currentTime
|
||||||
self.m_lastClusterState = copy.copy(clusterState)
|
self.last_cluster_state = copy.copy(clusterState)
|
||||||
# print 'self.m_lastClusterState', self.m_lastClusterState
|
# print 'self.last_cluster_state', self.last_cluster_state
|
||||||
return machinesThatNeedWakeUp
|
return machinesThatNeedWakeUp
|
||||||
|
|
|
@ -1,58 +1,58 @@
|
||||||
import time
|
import time
|
||||||
from Util import executeProgram
|
from .Util import execute_program
|
||||||
from QstatParser import QstatParser
|
from .QstatParser import QstatParser
|
||||||
from Log import logDebug, logWarning
|
from .Log import logDebug, log_warning
|
||||||
|
|
||||||
|
|
||||||
class SunGridEngine:
|
class SunGridEngine:
|
||||||
|
|
||||||
def getCurrentJobsState(self):
|
def get_current_job_state(self):
|
||||||
bBUG_00000009_IS_STILL_ALIVE = True
|
bBUG_00000009_IS_STILL_ALIVE = True
|
||||||
if bBUG_00000009_IS_STILL_ALIVE:
|
if bBUG_00000009_IS_STILL_ALIVE:
|
||||||
logDebug('Querying the current state of jobs')
|
logDebug('Querying the current state of jobs')
|
||||||
returnCode = -1
|
return_code = -1
|
||||||
delayBetweenAttemps = 5 # in seconds
|
delay_between_attempts = 5 # in seconds
|
||||||
while returnCode != 0:
|
while return_code != 0:
|
||||||
command = ['qstat', '-f', '-u', '*']
|
command = ['qstat', '-f', '-u', '*']
|
||||||
(returnCode, qstatOutput, stderr) = executeProgram(command)
|
(return_code, qstat_output, stderr) = execute_program(command)
|
||||||
if returnCode != 0:
|
if return_code != 0:
|
||||||
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps))
|
log_warning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), return_code, qstat_output, stderr, delay_between_attempts))
|
||||||
time.sleep(delayBetweenAttemps)
|
time.sleep(delay_between_attempts)
|
||||||
if bBUG_00000009_IS_STILL_ALIVE:
|
if bBUG_00000009_IS_STILL_ALIVE:
|
||||||
logDebug('Just got current state of jobs')
|
logDebug('Just got current state of jobs')
|
||||||
|
|
||||||
jobsState = QstatParser().parseQstatOutput(qstatOutput)
|
jobs_state = QstatParser().parse_qstat_output(qstat_output)
|
||||||
jobsState.setTime(time.time())
|
jobs_state.set_time(time.time())
|
||||||
|
|
||||||
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
|
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
|
||||||
if False: # no need for job details at the moment and since it's very slow, it's been disabled
|
if False: # no need for job details at the moment and since it's very slow, it's been disabled
|
||||||
for unused_jobId, job in jobsState.getPendingJobs().items():
|
for unused_jobId, job in jobs_state.get_pending_jobs().items():
|
||||||
(returnCode, stdout, stderr) = executeProgram(['qstat', '-j', job.getId().asStr()])
|
(return_code, stdout, stderr) = execute_program(['qstat', '-j', job.getId().asStr()])
|
||||||
assert returnCode != 0, 'prout'
|
assert return_code != 0, 'prout'
|
||||||
QstatParser().parseJobDetails(stdout, job)
|
QstatParser().parse_job_details(stdout, job)
|
||||||
|
|
||||||
return jobsState
|
return jobs_state
|
||||||
|
|
||||||
def setQueueInstanceActivation(self, strQueueInstanceName, bEnable):
|
def set_queue_instance_activation(self, queue_instance_name: str, enable: bool):
|
||||||
argument = 'd'
|
argument = 'd'
|
||||||
if bEnable:
|
if enable:
|
||||||
argument = 'e'
|
argument = 'e'
|
||||||
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
|
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
|
||||||
delayBetweenAttemps = 5 # in seconds
|
delay_between_attempts = 5 # in seconds
|
||||||
while True:
|
while True:
|
||||||
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-' + argument, strQueueInstanceName])
|
error_code, unused_stdout, unused_stderr = execute_program(['qmod', '-' + argument, queue_instance_name])
|
||||||
if bBUG_00000269_IS_STILL_ALIVE:
|
if bBUG_00000269_IS_STILL_ALIVE:
|
||||||
# if the command failed, try again
|
# if the command failed, try again
|
||||||
if errorCode == 0:
|
if error_code == 0:
|
||||||
break
|
break
|
||||||
time.sleep(delayBetweenAttemps)
|
time.sleep(delay_between_attempts)
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
return (errorCode == 0)
|
return (error_code == 0)
|
||||||
|
|
||||||
def queueIsEmpty(self, strMachineName):
|
def queue_is_empty(self, machine_name: str):
|
||||||
(returnCode, qstatOutput, unused_stderr) = executeProgram(['qstat', '-f', '-u', '*'])
|
(returnCode, qstat_output, unused_stderr) = execute_program(['qstat', '-f', '-u', '*'])
|
||||||
assert returnCode == 0
|
assert returnCode == 0
|
||||||
jobsState = QstatParser().parseQstatOutput(qstatOutput)
|
jobs_state = QstatParser().parse_qstat_output(qstat_output)
|
||||||
jobs = jobsState.getJobsOnMachine(strMachineName)
|
jobs = jobs_state.get_jobs_on_machine(machine_name)
|
||||||
return (len(jobs) == 0)
|
return (len(jobs) == 0)
|
||||||
|
|
|
@ -1,24 +1,21 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
from .Log import log_info
|
||||||
sys.path.insert(0, '..')
|
from .Util import get_power_state, blocking_put_machine_to_sleep, blocking_wake_up_machine, execute_command, execute_ipmi_command
|
||||||
from Log import logInfo
|
from .PowerState import PowerState
|
||||||
import Util
|
|
||||||
from PowerState import PowerState
|
|
||||||
from HTMLParser import HTMLParser
|
|
||||||
|
|
||||||
|
|
||||||
def Test0000():
|
def Test0000():
|
||||||
logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
|
log_info('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
|
||||||
strTargetMachineName = 'simpatix12'
|
strTarget_machine_name = 'simpatix12'
|
||||||
ePowerState = Util.getPowerState(strTargetMachineName)
|
ePowerState = get_power_state(strTarget_machine_name)
|
||||||
while True:
|
while True:
|
||||||
if ePowerState == PowerState.ON:
|
if ePowerState == PowerState.ON:
|
||||||
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
bSuccess = blocking_put_machine_to_sleep(strTarget_machine_name)
|
||||||
assert bSuccess
|
assert bSuccess
|
||||||
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
bSuccess = blocking_put_machine_to_sleep(strTarget_machine_name)
|
||||||
ePowerState = PowerState.SLEEP
|
ePowerState = PowerState.SLEEP
|
||||||
elif ePowerState == PowerState.SLEEP:
|
elif ePowerState == PowerState.SLEEP:
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = blocking_wake_up_machine(strTarget_machine_name)
|
||||||
assert bSuccess
|
assert bSuccess
|
||||||
ePowerState = PowerState.ON
|
ePowerState = PowerState.ON
|
||||||
else:
|
else:
|
||||||
|
@ -26,30 +23,30 @@ def Test0000():
|
||||||
|
|
||||||
|
|
||||||
def Test0001():
|
def Test0001():
|
||||||
logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same tim ?')
|
log_info('Testing bug 00000003 : could it be caused by a sleep and a power on at the same tim ?')
|
||||||
strTargetMachineName = 'simpatix12'
|
strTarget_machine_name = 'simpatix12'
|
||||||
ePowerState = Util.getPowerState(strTargetMachineName)
|
ePowerState = get_power_state(strTarget_machine_name)
|
||||||
if ePowerState == PowerState.SLEEP:
|
if ePowerState == PowerState.SLEEP:
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = blocking_wake_up_machine(strTarget_machine_name)
|
||||||
assert bSuccess
|
assert bSuccess
|
||||||
ePowerState = PowerState.ON
|
ePowerState = PowerState.ON
|
||||||
assert ePowerState == PowerState.ON
|
assert ePowerState == PowerState.ON
|
||||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
|
execute_command("ssh %s 'pmset sleepnow'" % strTarget_machine_name)
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = blocking_wake_up_machine(strTarget_machine_name)
|
||||||
assert bSuccess
|
assert bSuccess
|
||||||
|
|
||||||
|
|
||||||
def Test0002():
|
def Test0002():
|
||||||
logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
|
log_info('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
|
||||||
strTargetMachineName = 'simpatix12'
|
strTarget_machine_name = 'simpatix12'
|
||||||
ePowerState = Util.getPowerState(strTargetMachineName)
|
ePowerState = get_power_state(strTarget_machine_name)
|
||||||
if ePowerState == PowerState.ON:
|
if ePowerState == PowerState.ON:
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = blocking_wake_up_machine(strTarget_machine_name)
|
||||||
assert bSuccess
|
assert bSuccess
|
||||||
ePowerState = PowerState.SLEEP
|
ePowerState = PowerState.SLEEP
|
||||||
assert ePowerState == PowerState.SLEEP
|
assert ePowerState == PowerState.SLEEP
|
||||||
Util.executeIpmiCommand(strTargetMachineName, 'chassis power on')
|
execute_ipmi_command(strTarget_machine_name, 'chassis power on')
|
||||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
|
execute_command("ssh %s 'pmset sleepnow'" % strTarget_machine_name)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -1,80 +1,81 @@
|
||||||
# import .Util
|
|
||||||
# import ..SimpaDbUtil
|
|
||||||
from .Log import logDebug, logInfo, logWarning, logError
|
|
||||||
from .PowerState import PowerState, PowerStateToStr
|
|
||||||
import re
|
import re
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
|
from ..Util import execute_program as exe_prog
|
||||||
|
from ..Util import execute_command as exe_comm
|
||||||
|
from ..Util import send_text_mail
|
||||||
|
from ..SimpaDbUtil import getLightOutManagementIpAddress, is_machine_responding
|
||||||
|
from .Log import logDebug, log_info, log_warning, logError
|
||||||
|
from .PowerState import PowerState, PowerStateToStr
|
||||||
|
|
||||||
|
|
||||||
def executeProgram(astrArguments):
|
def execute_program(astrArguments):
|
||||||
bBUG_00000008_IS_STILL_ACTIVE = True
|
bBUG_00000008_IS_STILL_ACTIVE = True
|
||||||
if bBUG_00000008_IS_STILL_ACTIVE:
|
if bBUG_00000008_IS_STILL_ACTIVE:
|
||||||
logDebug('executeProgram : program = [%s]' % (','.join(astrArguments)))
|
logDebug('execute_program : program = [%s]' % (','.join(astrArguments)))
|
||||||
(returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments)
|
(returnCode, stdout, stderr) = exe_prog(astrArguments)
|
||||||
if bBUG_00000008_IS_STILL_ACTIVE:
|
if bBUG_00000008_IS_STILL_ACTIVE:
|
||||||
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
logDebug('execute_command : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
||||||
# for debugging purpose, log info in case the command failed
|
# for debugging purpose, log info in case the command failed
|
||||||
if returnCode != 0:
|
if returnCode != 0:
|
||||||
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
logDebug('execute_command : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
||||||
logDebug('executeCommand : stdout of [%s] = %s' % (','.join(astrArguments), stdout))
|
logDebug('execute_command : stdout of [%s] = %s' % (','.join(astrArguments), stdout))
|
||||||
logDebug('executeCommand : stderr of [%s] = %s' % (','.join(astrArguments), stderr))
|
logDebug('execute_command : stderr of [%s] = %s' % (','.join(astrArguments), stderr))
|
||||||
return (returnCode, stdout, stderr)
|
return (returnCode, stdout, stderr)
|
||||||
|
|
||||||
|
|
||||||
def executeCommand(command):
|
def execute_command(command):
|
||||||
# logDebug('executeCommand : command = ' + command)
|
# logDebug('execute_command : command = ' + command)
|
||||||
(returnCode, stdout, stderr) = Lib.Util.executeCommand(command)
|
(returnCode, stdout, stderr) = exe_comm(command)
|
||||||
# logDebug('executeCommand : return code of "'+command+'" = '+str(returnCode))
|
# logDebug('execute_command : return code of "'+command+'" = '+str(returnCode))
|
||||||
return (returnCode, stdout, stderr)
|
return (returnCode, stdout, stderr)
|
||||||
|
|
||||||
|
|
||||||
def executeIpmiCommand(machineName, ipmiCommandArgs):
|
def execute_ipmi_command(machineName, ipmiCommandArgs):
|
||||||
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName)
|
lomIpAddress = getLightOutManagementIpAddress(machineName)
|
||||||
lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
|
lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
|
||||||
astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
|
astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
|
||||||
astrProgram.extend(ipmiCommandArgs)
|
astrProgram.extend(ipmiCommandArgs)
|
||||||
# print 'executeIpmiCommand'
|
# print 'execute_ipmi_command'
|
||||||
# print astrProgram
|
# print astrProgram
|
||||||
bBUG_00000005_IS_STILL_ACTIVE = True
|
bBUG_00000005_IS_STILL_ACTIVE = True
|
||||||
if bBUG_00000005_IS_STILL_ACTIVE:
|
if bBUG_00000005_IS_STILL_ACTIVE:
|
||||||
# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
|
# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
|
||||||
bCommandSucceeded = False
|
bCommandSucceeded = False
|
||||||
while not bCommandSucceeded:
|
while not bCommandSucceeded:
|
||||||
(returnCode, stdout, stderr) = executeProgram(astrProgram)
|
(returnCode, stdout, stderr) = execute_program(astrProgram)
|
||||||
if returnCode == 0:
|
if returnCode == 0:
|
||||||
bCommandSucceeded = True
|
bCommandSucceeded = True
|
||||||
else:
|
else:
|
||||||
logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
|
log_warning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
|
||||||
time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
|
time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
|
||||||
else:
|
else:
|
||||||
(returnCode, stdout, stderr) = executeProgram(astrProgram)
|
(returnCode, stdout, stderr) = execute_program(astrProgram)
|
||||||
"""
|
# sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
||||||
sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
# Unabled to establish a session with the BMC.
|
||||||
Unabled to establish a session with the BMC.
|
# Command failed due to insufficient resources for session (0xFFFEF901)
|
||||||
Command failed due to insufficient resources for session (0xFFFEF901)
|
# -> this error means that the number of active conections to the BMC has reached the maximum (usually 5).
|
||||||
-> this error means that the number of active conections to the BMC has reached the maximum (usually 5).
|
|
||||||
|
|
||||||
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
# sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
||||||
Unabled to establish a session with the BMC.
|
# Unabled to establish a session with the BMC.
|
||||||
Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
|
# Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
|
||||||
|
|
||||||
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
# sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
||||||
Unabled to establish a session with the BMC.
|
# Unabled to establish a session with the BMC.
|
||||||
Command failed due to Timeout (0xFFFEF9C3)
|
# Command failed due to Timeout (0xFFFEF9C3)
|
||||||
"""
|
|
||||||
|
|
||||||
return (returnCode, stdout, stderr)
|
return (returnCode, stdout, stderr)
|
||||||
|
|
||||||
|
|
||||||
def getPowerState(machineName):
|
def get_power_state(machineName):
|
||||||
ePowerState = PowerState.UNKNOWN
|
ePowerState = PowerState.UNKNOWN
|
||||||
bPowerStateRead = False
|
bPowerStateRead = False
|
||||||
iNumFailedAttempts = 0
|
iNumFailedAttempts = 0
|
||||||
while not bPowerStateRead:
|
while not bPowerStateRead:
|
||||||
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State'])
|
(returnCode, stdout, _stderr) = execute_ipmi_command(machineName, ['sensor', 'get', 'ACPI State'])
|
||||||
if returnCode == 0:
|
if returnCode == 0:
|
||||||
matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
|
matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
|
||||||
bBUG_00000002_IS_STILL_ACTIVE = True
|
bBUG_00000002_IS_STILL_ACTIVE = True
|
||||||
|
@ -83,7 +84,7 @@ def getPowerState(machineName):
|
||||||
# the following warning has been commented out because it pollutes the logs and apparently
|
# the following warning has been commented out because it pollutes the logs and apparently
|
||||||
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
|
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
|
||||||
# no power on event is logged ...
|
# no power on event is logged ...
|
||||||
# logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
|
# log_warning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
|
||||||
return PowerState.ON
|
return PowerState.ON
|
||||||
else:
|
else:
|
||||||
assert matchObj
|
assert matchObj
|
||||||
|
@ -103,31 +104,31 @@ def getPowerState(machineName):
|
||||||
iMAX_NUM_ATTEMPTS = 5
|
iMAX_NUM_ATTEMPTS = 5
|
||||||
iNumFailedAttempts += 1
|
iNumFailedAttempts += 1
|
||||||
if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
|
if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
|
||||||
logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
|
log_warning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
else:
|
else:
|
||||||
logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
|
log_warning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
|
||||||
ePowerState = PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged
|
ePowerState = PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged
|
||||||
bPowerStateRead = True
|
bPowerStateRead = True
|
||||||
return ePowerState
|
return ePowerState
|
||||||
|
|
||||||
|
|
||||||
def wakeUpMachine(machineName):
|
def wake_up_machine(machineName):
|
||||||
"""
|
"""
|
||||||
this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
|
this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
|
||||||
@return true on success, false otherwise
|
@return true on success, false otherwise
|
||||||
@note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
|
@note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
|
||||||
"""
|
"""
|
||||||
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['chassis', 'power', 'on'])
|
(returnCode, _stdout, _stderr) = execute_ipmi_command(machineName, ['chassis', 'power', 'on'])
|
||||||
bSuccess = (returnCode == 0) # this command can fail if the machine is manually unplugged for example
|
bSuccess = (returnCode == 0) # this command can fail if the machine is manually unplugged for example
|
||||||
return bSuccess
|
return bSuccess
|
||||||
|
|
||||||
|
|
||||||
def blockingPutMachineToSleep(machineName):
|
def blocking_put_machine_to_sleep(machineName):
|
||||||
"""
|
"""
|
||||||
@return true on success, false otherwise
|
@return true on success, false otherwise
|
||||||
"""
|
"""
|
||||||
logInfo('putting machine %s to sleep...' % machineName)
|
log_info('putting machine %s to sleep...' % machineName)
|
||||||
iMaxNumAttempts = 5
|
iMaxNumAttempts = 5
|
||||||
bSuccess = False
|
bSuccess = False
|
||||||
bBUG_239_IS_STILL_ALIVE = True
|
bBUG_239_IS_STILL_ALIVE = True
|
||||||
|
@ -135,36 +136,36 @@ def blockingPutMachineToSleep(machineName):
|
||||||
# note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
|
# note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
|
||||||
while not bSuccess:
|
while not bSuccess:
|
||||||
# note : pmset must be executed as root
|
# note : pmset must be executed as root
|
||||||
(returnCode, stdout, stderr) = executeProgram(['ssh', machineName, 'pmset sleepnow'])
|
(_returnCode, _stdout, _stderr) = execute_program(['ssh', machineName, 'pmset sleepnow'])
|
||||||
# check if the machine actually went to sleep
|
# check if the machine actually went to sleep
|
||||||
iMaxGoToSleepDuration = 30 # in seconds
|
iMaxGoToSleepDuration = 30 # in seconds
|
||||||
iDelay = 0
|
iDelay = 0
|
||||||
while iDelay < iMaxGoToSleepDuration:
|
while iDelay < iMaxGoToSleepDuration:
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
iDelay += 5
|
iDelay += 5
|
||||||
ePowerState = getPowerState(machineName)
|
ePowerState = get_power_state(machineName)
|
||||||
if ePowerState == PowerState.SLEEP:
|
if ePowerState == PowerState.SLEEP:
|
||||||
logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
|
log_info('machine %s is now sleeping (put to sleep succeeded)' % machineName)
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
if ePowerState != PowerState.ON:
|
if ePowerState != PowerState.ON:
|
||||||
logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
|
log_warning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
|
||||||
assert ePowerState == PowerState.ON
|
assert ePowerState == PowerState.ON
|
||||||
iAttempt += 1
|
iAttempt += 1
|
||||||
if iAttempt > iMaxNumAttempts:
|
if iAttempt > iMaxNumAttempts:
|
||||||
if bBUG_239_IS_STILL_ALIVE:
|
if bBUG_239_IS_STILL_ALIVE:
|
||||||
logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
|
log_warning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
|
log_warning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
|
log_warning('the attempt to put %s to sleep failed... trying again' % (machineName))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def blockingWakeUpMachine(machineName):
|
def blocking_wake_up_machine(machineName):
|
||||||
logInfo('waking up machine %s...' % machineName)
|
log_info('waking up machine %s...' % machineName)
|
||||||
numAttempts = 0
|
numAttempts = 0
|
||||||
bWakeUpFailed = True
|
bWakeUpFailed = True
|
||||||
while bWakeUpFailed: # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
|
while bWakeUpFailed: # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
|
||||||
|
@ -172,45 +173,45 @@ def blockingWakeUpMachine(machineName):
|
||||||
iNumWakeUpAttempts = 0
|
iNumWakeUpAttempts = 0
|
||||||
bWakeUpMachineSucceeded = False
|
bWakeUpMachineSucceeded = False
|
||||||
while not bWakeUpMachineSucceeded:
|
while not bWakeUpMachineSucceeded:
|
||||||
bWakeUpMachineSucceeded = wakeUpMachine(machineName)
|
bWakeUpMachineSucceeded = wake_up_machine(machineName)
|
||||||
iNumWakeUpAttempts += 1
|
iNumWakeUpAttempts += 1
|
||||||
# the previous command can fail if the machine is already in a transition
|
# the previous command can fail if the machine is already in a transition
|
||||||
# in that case we try sevral times bevire giving up
|
# in that case we try sevral times bevire giving up
|
||||||
if not bWakeUpMachineSucceeded:
|
if not bWakeUpMachineSucceeded:
|
||||||
if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
|
if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
|
||||||
iDelay = 5
|
iDelay = 5
|
||||||
logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
|
log_warning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
|
||||||
time.sleep(iDelay)
|
time.sleep(iDelay)
|
||||||
else:
|
else:
|
||||||
logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
|
log_warning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
|
||||||
return False # couldn't wake up to machine for whatever reason
|
return False # couldn't wake up to machine for whatever reason
|
||||||
|
|
||||||
bWakeUpFailed = False
|
bWakeUpFailed = False
|
||||||
# wait until the machine is operational
|
# wait until the machine is operational
|
||||||
WAKEUPTIMEOUT = 5 * 60 # max number of seconds allowed for a machine to be alive after a wakeup request
|
WAKEUPTIMEOUT = 5 * 60 # max number of seconds allowed for a machine to be alive after a wakeup request
|
||||||
wakeUpToAliveDuration = 0
|
wakeUpToAliveDuration = 0
|
||||||
while not Lib.SimpaDbUtil.isMachineResponding(machineName):
|
while not is_machine_responding(machineName):
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
wakeUpToAliveDuration += 5
|
wakeUpToAliveDuration += 5
|
||||||
if wakeUpToAliveDuration > WAKEUPTIMEOUT:
|
if wakeUpToAliveDuration > WAKEUPTIMEOUT:
|
||||||
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
|
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
|
||||||
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
|
log_warning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
|
||||||
bWakeUpFailed = True
|
bWakeUpFailed = True
|
||||||
break
|
break
|
||||||
if bWakeUpFailed:
|
if bWakeUpFailed:
|
||||||
numAttempts += 1
|
numAttempts += 1
|
||||||
if numAttempts >= 2:
|
if numAttempts >= 2:
|
||||||
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
|
log_warning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
|
||||||
return False # power state changed manually ?
|
return False # power state changed manually ?
|
||||||
else:
|
else:
|
||||||
logWarning('attempting to wake up %s one more time' % (machineName))
|
log_warning('attempting to wake up %s one more time' % (machineName))
|
||||||
else:
|
else:
|
||||||
# wake up completed
|
# wake up completed
|
||||||
logInfo('Waking up of machine %s completed successfully' % machineName)
|
log_info('Waking up of machine %s completed successfully' % machineName)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def onException(exception):
|
def on_exception(exception):
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
strExceptionType = type(exception)
|
strExceptionType = type(exception)
|
||||||
strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
|
strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
|
||||||
|
@ -224,11 +225,10 @@ def onException(exception):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
|
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
|
||||||
# by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that if this operation fails, then the
|
# by a failure of send_text_mail... that's why I've embedded the sendmail inside a try except block, so that if this operation fails, then the
|
||||||
# kill of the main process is still executed.
|
# kill of the main process is still executed.
|
||||||
Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
|
send_text_mail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
|
||||||
except BaseException:
|
except BaseException:
|
||||||
logError("Could not send the email to notify the administrator that cluster controller failed")
|
logError("Could not send the email to notify the administrator that cluster controller failed")
|
||||||
pass
|
execute_command('kill -9 %d' % os.getpid()) # stop other threads immediately
|
||||||
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
|
|
||||||
exit()
|
exit()
|
||||||
|
|
|
@ -1,109 +1,109 @@
|
||||||
#Copyright Jon Berg , turtlemeat.com
|
# Copyright Jon Berg , turtlemeat.com
|
||||||
|
import cgi
|
||||||
import string,cgi,time
|
import time
|
||||||
from os import curdir, sep
|
from os import curdir, sep
|
||||||
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
|
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer # pylint:disable=import-error
|
||||||
import threading
|
import threading
|
||||||
import Util
|
# import pri
|
||||||
#import pri
|
from urlparse import urlparse, parse_qs # pylint:disable=import-error
|
||||||
from urlparse import urlparse, parse_qs
|
|
||||||
import xml.dom.minidom
|
import xml.dom.minidom
|
||||||
#>>> url = 'http://example.com/?foo=bar&one=1'
|
from .Util import on_exception
|
||||||
#>>> parse_qs(urlparse(url).query)
|
# >>> url = 'http://example.com/?foo=bar&one=1'
|
||||||
#{'foo': ['bar'], 'one': ['1']}
|
# >>> parse_qs(urlparse(url).query)
|
||||||
|
# {'foo': ['bar'], 'one': ['1']}
|
||||||
|
|
||||||
|
|
||||||
class MyHandler(BaseHTTPRequestHandler):
|
class MyHandler(BaseHTTPRequestHandler):
|
||||||
|
|
||||||
def do_GET(self):
|
def do_GET(self):
|
||||||
try:
|
try:
|
||||||
paramsDict=parse_qs(urlparse(self.path).query)
|
paramsDict = parse_qs(urlparse(self.path).query)
|
||||||
if self.path.endswith(".html"):
|
if self.path.endswith(".html"):
|
||||||
f = open(curdir + sep + self.path) #self.path has /test.html
|
f = open(curdir + sep + self.path, encoding='utf8') # self.path has /test.html
|
||||||
#note that this potentially makes every file on your computer readable by the internet
|
# note that this potentially makes every file on your computer readable by the internet
|
||||||
|
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
self.send_header('Content-type', 'text/html')
|
self.send_header('Content-type', 'text/html')
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write(f.read())
|
self.wfile.write(f.read())
|
||||||
f.close()
|
f.close()
|
||||||
return
|
return
|
||||||
if self.path.endswith(".esp"): #our dynamic content
|
if self.path.endswith(".esp"): # our dynamic content
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
self.send_header('Content-type', 'text/html')
|
self.send_header('Content-type', 'text/html')
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
self.wfile.write("hey, today is the" + str(time.localtime()[7]))
|
self.wfile.write("hey, today is the" + str(time.localtime()[7]))
|
||||||
self.wfile.write(" day in the year " + str(time.localtime()[0]))
|
self.wfile.write(" day in the year " + str(time.localtime()[0]))
|
||||||
return
|
return
|
||||||
if self.path.endswith("ShowControlledMachines"): #http://simpatix10.univ-rennes1.fr:8080/ShowControlledMachines
|
if self.path.endswith("ShowControlledMachines"): # http://simpatix10.univ-rennes1.fr:8080/ShowControlledMachines
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
self.send_header('Content-type', 'text/xml')
|
self.send_header('Content-type', 'text/xml')
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
|
|
||||||
# Create the minidom document
|
# Create the minidom document
|
||||||
doc = xml.dom.minidom.Document()
|
doc = xml.dom.minidom.Document()
|
||||||
|
|
||||||
# Create the <ControlledMachines> root element
|
# Create the <ControlledMachines> root element
|
||||||
controlledMachinesElement = doc.createElement("ControlledMachines")
|
controlledMachinesElement = doc.createElement("ControlledMachines")
|
||||||
doc.appendChild(controlledMachinesElement)
|
doc.appendChild(controlledMachinesElement)
|
||||||
|
|
||||||
for machine in self.server.m_clusterController.m_clusterStatus.m_clusterNodes.values():
|
for machine in self.server.cluster_controller.cluster_status.cluster_nodes.values():
|
||||||
# Create the main <card> element
|
# Create the main <card> element
|
||||||
controlledMachineElement = doc.createElement("Machine")
|
controlledMachineElement = doc.createElement("Machine")
|
||||||
controlledMachineElement.setAttribute("name", machine.getName())
|
controlledMachineElement.setAttribute("name", machine.get_name())
|
||||||
controlledMachinesElement.appendChild(controlledMachineElement)
|
controlledMachinesElement.appendChild(controlledMachineElement)
|
||||||
# Print our newly created XML
|
# Print our newly created XML
|
||||||
self.wfile.write(doc.toprettyxml(indent=" "))
|
self.wfile.write(doc.toprettyxml(indent=" "))
|
||||||
return
|
return
|
||||||
if urlparse(self.path).path == '/SetControlOnMachine': #http://simpatix10.univ-rennes1.fr:8080/SetControlOnMachine?machineName=simpatix30&control=1
|
if urlparse(self.path).path == '/SetControlOnMachine': # http://simpatix10.univ-rennes1.fr:8080/SetControlOnMachine?machineName=simpatix30&control=1
|
||||||
machineName = paramsDict['machineName'][0]
|
machineName = paramsDict['machineName'][0]
|
||||||
bControl = (paramsDict['control'][0] == '1')
|
bControl = (paramsDict['control'][0] == '1')
|
||||||
self.server.m_clusterController.setControlOnMachine(machineName, bControl)
|
self.server.cluster_controller.set_control_on_machine(machineName, bControl)
|
||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
self.send_header('Content-type', 'text/html')
|
self.send_header('Content-type', 'text/html')
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
if bControl == True:
|
if bControl is True:
|
||||||
self.wfile.write("%s is now controlled by ClusterController" % machineName)
|
self.wfile.write("%s is now controlled by ClusterController" % machineName)
|
||||||
else:
|
else:
|
||||||
self.wfile.write("%s is no longer controlled by ClusterController" % machineName)
|
self.wfile.write("%s is no longer controlled by ClusterController" % machineName)
|
||||||
return
|
return
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
except IOError:
|
except IOError:
|
||||||
self.send_error(404,'File Not Found: %s' % self.path)
|
self.send_error(404, 'File Not Found: %s' % self.path)
|
||||||
|
|
||||||
|
|
||||||
def do_POST(self):
|
def do_POST(self):
|
||||||
global rootnode
|
|
||||||
try:
|
try:
|
||||||
ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
|
ctype, pdict = cgi.parse_header(self.headers.getheader('content-type'))
|
||||||
if ctype == 'multipart/form-data':
|
if ctype == 'multipart/form-data':
|
||||||
query=cgi.parse_multipart(self.rfile, pdict)
|
query = cgi.parse_multipart(self.rfile, pdict)
|
||||||
self.send_response(301)
|
self.send_response(301)
|
||||||
|
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
upfilecontent = query.get('upfile')
|
upfilecontent = query.get('upfile')
|
||||||
print "filecontent", upfilecontent[0]
|
print("filecontent", upfilecontent[0])
|
||||||
self.wfile.write("<HTML>POST OK.<BR><BR>");
|
self.wfile.write("<HTML>POST OK.<BR><BR>")
|
||||||
self.wfile.write(upfilecontent[0]);
|
self.wfile.write(upfilecontent[0])
|
||||||
|
|
||||||
except :
|
except BaseException:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class WebServerThread( threading.Thread ):
|
|
||||||
def __init__( self, clusterController ):
|
|
||||||
threading.Thread.__init__(self)
|
|
||||||
#self.m_clusterController = clusterController
|
|
||||||
self.m_bStop = False
|
|
||||||
self.m_httpServer = HTTPServer(('', 8080), MyHandler)
|
|
||||||
self.m_httpServer.m_clusterController = clusterController
|
|
||||||
def run( self ):
|
|
||||||
try:
|
|
||||||
while not self.m_bStop:
|
|
||||||
self.m_httpServer.handle_request()
|
|
||||||
#self.m_httpServer.serve_forever()
|
|
||||||
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
|
||||||
self.m_httpServer.socket.close()
|
|
||||||
Util.onException(exception)
|
|
||||||
|
|
||||||
|
class WebServerThread(threading.Thread):
|
||||||
|
stop: bool
|
||||||
|
http_server: HTTPServer
|
||||||
|
|
||||||
|
def __init__(self, clusterController):
|
||||||
|
threading.Thread.__init__(self)
|
||||||
|
# self.cluster_controller = clusterController
|
||||||
|
self.stop = False
|
||||||
|
self.http_server = HTTPServer(('', 8080), MyHandler)
|
||||||
|
self.http_server.cluster_controller = clusterController
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
try:
|
||||||
|
while not self.stop:
|
||||||
|
self.http_server.handle_request()
|
||||||
|
# self.http_server.serve_forever()
|
||||||
|
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||||
|
self.http_server.socket.close()
|
||||||
|
on_exception(exception)
|
||||||
|
|
|
@ -12,35 +12,35 @@ class ClusterNodeSensorsReadings:
|
||||||
POWERSTATE_SLEEP=3
|
POWERSTATE_SLEEP=3
|
||||||
"""
|
"""
|
||||||
def __init__(self, clusterNodeName):
|
def __init__(self, clusterNodeName):
|
||||||
self.m_clusterNodeName = clusterNodeName
|
self.cluster_node_name = clusterNodeName
|
||||||
self.m_sensors = {}
|
self.sensors = {}
|
||||||
# self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
|
# self.power_state = ClusterNodeStatus.POWERSTATE_UNKNOWN
|
||||||
return
|
return
|
||||||
|
|
||||||
def addSensor(self, sensor):
|
def addSensor(self, sensor):
|
||||||
self.m_sensors[sensor.m_name] = sensor
|
self.sensors[sensor.name] = sensor
|
||||||
|
|
||||||
def dump(self):
|
def dump(self):
|
||||||
for key, sensor in self.m_sensors.items():
|
for key, sensor in self.sensors.items():
|
||||||
sensor.dump()
|
sensor.dump()
|
||||||
return
|
return
|
||||||
|
|
||||||
# def getPowerState(self):
|
# def get_power_state(self):
|
||||||
# return self.m_powerState
|
# return self.power_state
|
||||||
|
|
||||||
def getLowestTemperature(self):
|
def getLowestTemperature(self):
|
||||||
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||||
lowestTemperature = 0.0
|
lowestTemperature = 0.0
|
||||||
lowestTemperatureIsDefined = False
|
lowestTemperatureIsDefined = False
|
||||||
for key, sensor in self.m_sensors.items():
|
for key, sensor in self.sensors.items():
|
||||||
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||||
if sensor.typeName() == 'Temperature':
|
if sensor.typeName() == 'Temperature':
|
||||||
sensor.m_temperature
|
sensor.temperature
|
||||||
if lowestTemperatureIsDefined:
|
if lowestTemperatureIsDefined:
|
||||||
if sensor.m_temperature < lowestTemperature:
|
if sensor.temperature < lowestTemperature:
|
||||||
lowestTemperature = sensor.m_temperature
|
lowestTemperature = sensor.temperature
|
||||||
else:
|
else:
|
||||||
lowestTemperature = sensor.m_temperature
|
lowestTemperature = sensor.temperature
|
||||||
lowestTemperatureIsDefined = True
|
lowestTemperatureIsDefined = True
|
||||||
assert lowestTemperatureIsDefined
|
assert lowestTemperatureIsDefined
|
||||||
# log('ClusterNodeSensorsReadings::getLowestTemperature : end')
|
# log('ClusterNodeSensorsReadings::getLowestTemperature : end')
|
||||||
|
|
|
@ -31,12 +31,12 @@ class IpmiTool202Parser:
|
||||||
rpms = self.parseFanSensorOutput(f)
|
rpms = self.parseFanSensorOutput(f)
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
sensor = FanSensor(sensorName)
|
sensor = FanSensor(sensorName)
|
||||||
sensor.m_rpms = rpms
|
sensor.rpms = rpms
|
||||||
elif sensorType == 'Temperature':
|
elif sensorType == 'Temperature':
|
||||||
temperature = self.parseTemperatureSensorOutput(f)
|
temperature = self.parseTemperatureSensorOutput(f)
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
sensor = TemperatureSensor(sensorName)
|
sensor = TemperatureSensor(sensorName)
|
||||||
sensor.m_temperature = temperature
|
sensor.temperature = temperature
|
||||||
else:
|
else:
|
||||||
# ignoring other sensors
|
# ignoring other sensors
|
||||||
sensor = None
|
sensor = None
|
||||||
|
|
|
@ -22,10 +22,10 @@ class IpmiTool218Parser:
|
||||||
sensor = None
|
sensor = None
|
||||||
if sensorUnit == 'degrees C':
|
if sensorUnit == 'degrees C':
|
||||||
sensor = TemperatureSensor(sensorName)
|
sensor = TemperatureSensor(sensorName)
|
||||||
sensor.m_temperature = float(sensorValue)
|
sensor.temperature = float(sensorValue)
|
||||||
elif sensorUnit == 'RPM':
|
elif sensorUnit == 'RPM':
|
||||||
sensor = FanSensor(sensorName)
|
sensor = FanSensor(sensorName)
|
||||||
sensor.m_rpms = float(sensorValue)
|
sensor.rpms = float(sensorValue)
|
||||||
else:
|
else:
|
||||||
None
|
None
|
||||||
if sensor:
|
if sensor:
|
||||||
|
|
|
@ -1,23 +1,40 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
class Sensor:
|
class Sensor:
|
||||||
|
|
||||||
def __init__(self, sensorName):
|
def __init__(self, sensorName):
|
||||||
self.m_name = sensorName
|
self.name = sensorName
|
||||||
self.m_isValid = True # false if this sensor is not actually present on the target machine
|
self.is_valid = True # false if this sensor is not actually present on the target machine
|
||||||
return
|
return
|
||||||
|
|
||||||
def dump(self):
|
def dump(self):
|
||||||
print self.m_name
|
print(self.name)
|
||||||
|
|
||||||
|
|
||||||
class FanSensor(Sensor):
|
class FanSensor(Sensor):
|
||||||
|
rpms: Optional[float]
|
||||||
|
|
||||||
def __init__(self, sensorName):
|
def __init__(self, sensorName):
|
||||||
Sensor.__init__(self, sensorName)
|
Sensor.__init__(self, sensorName)
|
||||||
|
self.rpms = None
|
||||||
|
|
||||||
def dump(self):
|
def dump(self):
|
||||||
print 'Fan \'', self.m_name, '\' rpm=',self.m_rpms
|
print('Fan \'', self.name, '\' rpm=', self.rpms)
|
||||||
|
|
||||||
def typeName(self):
|
def typeName(self):
|
||||||
return 'Fan'
|
return 'Fan'
|
||||||
|
|
||||||
|
|
||||||
class TemperatureSensor(Sensor):
|
class TemperatureSensor(Sensor):
|
||||||
|
temperature: Optional[float]
|
||||||
|
|
||||||
def __init__(self, sensorName):
|
def __init__(self, sensorName):
|
||||||
Sensor.__init__(self, sensorName)
|
Sensor.__init__(self, sensorName)
|
||||||
|
self.temperature = None
|
||||||
|
|
||||||
def dump(self):
|
def dump(self):
|
||||||
print 'Temperature \'', self.m_name, '\' temperature=',self.m_temperature
|
print('Temperature \'', self.name, '\' temperature=', self.temperature)
|
||||||
|
|
||||||
def typeName(self):
|
def typeName(self):
|
||||||
return 'Temperature'
|
return 'Temperature'
|
||||||
|
|
|
@ -8,31 +8,31 @@ else:
|
||||||
import re
|
import re
|
||||||
from .wol import wake_on_lan
|
from .wol import wake_on_lan
|
||||||
import os
|
import os
|
||||||
from .Util import executeProgram, executeCommand, log
|
from .Util import execute_program, execute_command, log
|
||||||
import abc
|
import abc
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from .mysql2sqlite import mysql_to_sqlite
|
from .mysql2sqlite import mysql_to_sqlite
|
||||||
|
|
||||||
|
|
||||||
def isMachineResponding(machineName):
|
def is_machine_responding(machineName):
|
||||||
(returnCode, stdout, stderr) = executeProgram(['ping', '-o', '-t', '1', machineName])
|
(returnCode, stdout, stderr) = execute_program(['ping', '-o', '-t', '1', machineName])
|
||||||
# log( 'isMachineResponding : result of command %s : %d' % (command, returnCode) )
|
# log( 'is_machine_responding : result of command %s : %d' % (command, returnCode) )
|
||||||
|
|
||||||
if returnCode == 0:
|
if returnCode == 0:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
bMachineNameIsNotKnown = (returnCode == 68)
|
# bMachineNameIsNotKnown = (returnCode == 68)
|
||||||
bMachineIsNotResponding = (returnCode == 2)
|
bMachineIsNotResponding = (returnCode == 2)
|
||||||
if bMachineIsNotResponding is False:
|
if bMachineIsNotResponding is False:
|
||||||
bBUG_00000004_IS_STILL_ALIVE = True
|
bBUG_00000004_IS_STILL_ALIVE = True
|
||||||
if bBUG_00000004_IS_STILL_ALIVE is True and returnCode == 142:
|
if bBUG_00000004_IS_STILL_ALIVE is True and returnCode == 142:
|
||||||
log('isMachineResponding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
log('is_machine_responding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
||||||
# don't stop the program until we understand bug00000004
|
# don't stop the program until we understand bug00000004
|
||||||
elif bBUG_00000004_IS_STILL_ALIVE is True and returnCode == -14: # I had this error code on 07/09/2009 20:38 but I don't know yet what that means
|
elif bBUG_00000004_IS_STILL_ALIVE is True and returnCode == -14: # I had this error code on 07/09/2009 20:38 but I don't know yet what that means
|
||||||
log('isMachineResponding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
log('is_machine_responding : bug00000004 Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
||||||
# don't stop the program until we understand bug00000004
|
# don't stop the program until we understand bug00000004
|
||||||
else:
|
else:
|
||||||
log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
log('is_machine_responding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
||||||
assert False
|
assert False
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -46,7 +46,6 @@ class ISqlDatabaseBackend(object):
|
||||||
"""
|
"""
|
||||||
:param str sql_query: the sql query to perform
|
:param str sql_query: the sql query to perform
|
||||||
"""
|
"""
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteMysqlDb(ISqlDatabaseBackend):
|
class RemoteMysqlDb(ISqlDatabaseBackend):
|
||||||
|
@ -70,7 +69,7 @@ class RemoteMysqlDb(ISqlDatabaseBackend):
|
||||||
:param str sql_query: the sql query to perform
|
:param str sql_query: the sql query to perform
|
||||||
"""
|
"""
|
||||||
self._conn.query(sql_query)
|
self._conn.query(sql_query)
|
||||||
rows = conn.store_result()
|
rows = self._conn.store_result()
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
|
||||||
|
@ -87,7 +86,7 @@ class SqlFile(ISqlDatabaseBackend):
|
||||||
# - the file is stored on a solid state disk
|
# - the file is stored on a solid state disk
|
||||||
try:
|
try:
|
||||||
os.remove(sqlite_db_path)
|
os.remove(sqlite_db_path)
|
||||||
except:
|
except BaseException:
|
||||||
pass
|
pass
|
||||||
check_same_thread = False
|
check_same_thread = False
|
||||||
# this is to prevent the following error when run from apache/django : SQLite objects created in a thread can only be used in that same thread.The object was created in thread id 139672342353664 and this is thread id 139672333960960
|
# this is to prevent the following error when run from apache/django : SQLite objects created in a thread can only be used in that same thread.The object was created in thread id 139672342353664 and this is thread id 139672333960960
|
||||||
|
@ -95,7 +94,7 @@ class SqlFile(ISqlDatabaseBackend):
|
||||||
# If set False, the returned connection may be shared across multiple threads. When using multiple threads with the same connection writing operations should be serialized by the user to avoid data corruption
|
# If set False, the returned connection may be shared across multiple threads. When using multiple threads with the same connection writing operations should be serialized by the user to avoid data corruption
|
||||||
# I hope it's safe here but I'm not 100% sure though. Anyway, if the database gets corrupted, it's not a big deal since this memory resident database gets reconstructed from the sql file...
|
# I hope it's safe here but I'm not 100% sure though. Anyway, if the database gets corrupted, it's not a big deal since this memory resident database gets reconstructed from the sql file...
|
||||||
self._con = sqlite3.connect(sqlite_db_path, check_same_thread=check_same_thread)
|
self._con = sqlite3.connect(sqlite_db_path, check_same_thread=check_same_thread)
|
||||||
with open(str(self._sql_file_path), 'r') as f: # str conversion has been added to support older versions of python in which open don't accept arguments of type Path
|
with open(str(self._sql_file_path), 'r', encoding='utf8') as f: # str conversion has been added to support older versions of python in which open don't accept arguments of type Path
|
||||||
sql = f.read() # watch out for built-in `str`
|
sql = f.read() # watch out for built-in `str`
|
||||||
# print(sql)
|
# print(sql)
|
||||||
self._cur = self._con.cursor()
|
self._cur = self._con.cursor()
|
||||||
|
@ -111,7 +110,6 @@ class SqlFile(ISqlDatabaseBackend):
|
||||||
"""
|
"""
|
||||||
:param str sql_query: the sql query to perform
|
:param str sql_query: the sql query to perform
|
||||||
"""
|
"""
|
||||||
pass
|
|
||||||
self._cur.execute(sql_query)
|
self._cur.execute(sql_query)
|
||||||
rows = self._cur.fetchall()
|
rows = self._cur.fetchall()
|
||||||
return rows
|
return rows
|
||||||
|
@ -196,7 +194,7 @@ def getLightOutManagementIpAddress(machineName):
|
||||||
return ipAddress
|
return ipAddress
|
||||||
|
|
||||||
|
|
||||||
def getClusterMachinesNames():
|
def get_cluster_machines_names():
|
||||||
clusterMachinesNames = []
|
clusterMachinesNames = []
|
||||||
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
||||||
assert conn
|
assert conn
|
||||||
|
@ -223,14 +221,12 @@ def machineSupportsIpmi(machineName):
|
||||||
|
|
||||||
def putToSleep(machineName):
|
def putToSleep(machineName):
|
||||||
# note : pmset must be executed as root
|
# note : pmset must be executed as root
|
||||||
(returnCode, stdout, stderr) = executeCommand(['ssh', machineName, 'pmset sleepnow'])
|
(returnCode, stdout, _stderr) = execute_command(['ssh', machineName, 'pmset sleepnow'])
|
||||||
"""
|
# print returnCode
|
||||||
print returnCode
|
# print 'stdout :'
|
||||||
print 'stdout :'
|
# print stdout
|
||||||
print stdout
|
# print 'stderr :'
|
||||||
print 'stderr :'
|
# print stderr
|
||||||
print stderr
|
|
||||||
"""
|
|
||||||
assert returnCode == 0
|
assert returnCode == 0
|
||||||
# check if the command succeeded by looking at the output (that's the only way I found)
|
# check if the command succeeded by looking at the output (that's the only way I found)
|
||||||
f = StringIO.StringIO(stdout)
|
f = StringIO.StringIO(stdout)
|
||||||
|
@ -255,7 +251,7 @@ def isNonRespondingMachineSleeping(machineName):
|
||||||
"""
|
"""
|
||||||
wakeUp(machineName)
|
wakeUp(machineName)
|
||||||
time.sleep(120)
|
time.sleep(120)
|
||||||
if isMachineResponding(machineName):
|
if is_machine_responding(machineName):
|
||||||
putToSleep(machineName)
|
putToSleep(machineName)
|
||||||
time.sleep(30) # allow a little time to make sure the machine is ready to receive other wake on lan messages
|
time.sleep(30) # allow a little time to make sure the machine is ready to receive other wake on lan messages
|
||||||
return True
|
return True
|
||||||
|
@ -264,11 +260,9 @@ def isNonRespondingMachineSleeping(machineName):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
"""
|
# for i in range(30):
|
||||||
for i in range(30):
|
# machineName = 'simpatix%d' % (i+10)
|
||||||
machineName = 'simpatix%d' % (i+10)
|
# print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
|
||||||
print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
|
|
||||||
"""
|
|
||||||
wakeUp('simpatix21')
|
wakeUp('simpatix21')
|
||||||
# print putToSleep('simpatix13')
|
# print putToSleep('simpatix13')
|
||||||
# print isNonRespondingMachineSleeping('simpatix13')
|
# print isNonRespondingMachineSleeping('simpatix13')
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import time
|
import time
|
||||||
import subprocess
|
import subprocess
|
||||||
import io
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
# from wol import *
|
# from wol import *
|
||||||
|
@ -16,7 +15,7 @@ else:
|
||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
|
|
||||||
|
|
||||||
def sendTextMail(strFrom, to, strSubject, text):
|
def send_text_mail(strFrom, to, strSubject, text):
|
||||||
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
|
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
|
||||||
mail = MIMEText(text)
|
mail = MIMEText(text)
|
||||||
mail['From'] = strFrom
|
mail['From'] = strFrom
|
||||||
|
@ -30,12 +29,14 @@ def sendTextMail(strFrom, to, strSubject, text):
|
||||||
|
|
||||||
|
|
||||||
class Error(Exception):
|
class Error(Exception):
|
||||||
|
message: str
|
||||||
|
|
||||||
def __init__(self, strMessage):
|
def __init__(self, strMessage):
|
||||||
self.m_strMessage = strMessage
|
self.message = strMessage
|
||||||
|
|
||||||
|
|
||||||
def getHostName():
|
def getHostName():
|
||||||
(returnCode, stdout, stderr) = executeProgram(['hostname', '-s'])
|
(returnCode, stdout, stderr) = execute_program(['hostname', '-s'])
|
||||||
if returnCode != 0:
|
if returnCode != 0:
|
||||||
raise Error(stderr)
|
raise Error(stderr)
|
||||||
strHostName = re.sub(r"\n", "", stdout)
|
strHostName = re.sub(r"\n", "", stdout)
|
||||||
|
@ -46,18 +47,18 @@ def log(message):
|
||||||
print(time.asctime(time.localtime()) + ' : ' + message)
|
print(time.asctime(time.localtime()) + ' : ' + message)
|
||||||
|
|
||||||
|
|
||||||
def executeProgram(astrArguments):
|
def execute_program(astrArguments):
|
||||||
# log('executeProgram : program [%s]' % (','.join(astrArguments)))
|
# log('execute_program : program [%s]' % (','.join(astrArguments)))
|
||||||
popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
|
popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
|
||||||
stdout, stderr = popen.communicate()
|
stdout, stderr = popen.communicate()
|
||||||
# popen.wait()
|
# popen.wait()
|
||||||
result = (popen.returncode, stdout.decode(), stderr)
|
result = (popen.returncode, stdout.decode(), stderr)
|
||||||
# log('executeProgram : command %s popen.pid = %d' % (astrArguments[0], popen.pid))
|
# log('execute_program : command %s popen.pid = %d' % (astrArguments[0], popen.pid))
|
||||||
# os.kill(popen.pid, signal.SIGTERM)
|
# os.kill(popen.pid, signal.SIGTERM)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def executeCommand(command):
|
def execute_command(command):
|
||||||
"""
|
"""
|
||||||
executes the shell command such as 'set x=1; myprog $x'
|
executes the shell command such as 'set x=1; myprog $x'
|
||||||
"""
|
"""
|
||||||
|
@ -69,25 +70,25 @@ def executeCommand(command):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def executeCommandOn(target_machine_fqdn: str, command: str, user: str = None):
|
def execute_commandOn(target_machine_fqdn: str, command: str, user: str = None):
|
||||||
"""
|
"""
|
||||||
execute command on a local or remote machine (using ssh then)
|
execute command on a local or remote machine (using ssh then)
|
||||||
:param str user: if not None, the user that should be used to execute the command (instead of the current user)
|
:param str user: if not None, the user that should be used to execute the command (instead of the current user)
|
||||||
"""
|
"""
|
||||||
logging.debug("executing %s on %s as %s" % (command, target_machine_fqdn, user))
|
logging.debug("executing %s on %s as %s", command, target_machine_fqdn, user)
|
||||||
if getHostName() == target_machine_fqdn.split('.')[0]:
|
if getHostName() == target_machine_fqdn.split('.')[0]:
|
||||||
if user is not None:
|
if user is not None:
|
||||||
# su -c "ls -l /tmp" graffy
|
# su -c "ls -l /tmp" graffy
|
||||||
result = executeCommand("su -c '%s' %s" % (command, user))
|
result = execute_command("su -c '%s' %s" % (command, user))
|
||||||
else:
|
else:
|
||||||
result = executeCommand(command)
|
result = execute_command(command)
|
||||||
else:
|
else:
|
||||||
if user is not None:
|
if user is not None:
|
||||||
target = '%s@%s' % (user, target_machine_fqdn)
|
target = '%s@%s' % (user, target_machine_fqdn)
|
||||||
else:
|
else:
|
||||||
target = target_machine_fqdn
|
target = target_machine_fqdn
|
||||||
result = executeProgram(['ssh', target, "%s" % command])
|
result = execute_program(['ssh', target, "%s" % command])
|
||||||
logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user))
|
logging.debug("finished executing %s on %s as %s", command, target_machine_fqdn, user)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,16 +98,16 @@ def getUpsStatus():
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
self.TokenList = []
|
self.token_list = []
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
data = data.strip()
|
data = data.strip()
|
||||||
if data and len(data) > 0:
|
if data and len(data) > 0:
|
||||||
self.TokenList.append(data)
|
self.token_list.append(data)
|
||||||
# print data
|
# print data
|
||||||
|
|
||||||
def GetTokenList(self):
|
def get_token_list(self):
|
||||||
return self.TokenList
|
return self.token_list
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -114,21 +115,20 @@ def getUpsStatus():
|
||||||
f = urlopen(url)
|
f = urlopen(url)
|
||||||
res = f.read()
|
res = f.read()
|
||||||
f.close()
|
f.close()
|
||||||
except:
|
except BaseException:
|
||||||
print("bad read")
|
print("bad read")
|
||||||
return
|
return
|
||||||
h = MyHTMLParser()
|
h = MyHTMLParser()
|
||||||
h.feed(res)
|
h.feed(res)
|
||||||
tokensList = h.GetTokenList() # noqa:F841
|
_tokensList = h.get_token_list() # noqa:F841
|
||||||
|
raise NotImplementedError('the implementation is not complete')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from SimpaDbUtil import wakeUp
|
from .SimpaDbUtil import wakeUp
|
||||||
"""
|
# for i in range(30):
|
||||||
for i in range(30):
|
# machineName = 'simpatix%d' % (i+10)
|
||||||
machineName = 'simpatix%d' % (i+10)
|
# print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
|
||||||
print 'lom ip of %s is %s' % (machineName, getLightOutManagementIpAddress(machineName))
|
|
||||||
"""
|
|
||||||
wakeUp('simpatix21')
|
wakeUp('simpatix21')
|
||||||
# print putToSleep('simpatix13')
|
# print putToSleep('simpatix13')
|
||||||
# print isNonRespondingMachineSleeping('simpatix13')
|
# print isNonRespondingMachineSleeping('simpatix13')
|
||||||
|
|
|
@ -20,7 +20,7 @@ class CoclutoTestCase(unittest.TestCase):
|
||||||
qstat_output = file.read()
|
qstat_output = file.read()
|
||||||
# qstatParser = ClusterController.QstatParser()
|
# qstatParser = ClusterController.QstatParser()
|
||||||
qstatParser = QstatParser()
|
qstatParser = QstatParser()
|
||||||
job_state = qstatParser.parseQstatOutput(qstat_output, cluster_domain='ipr.univ-rennes1.fr')
|
job_state = qstatParser.parse_qstat_output(qstat_output, cluster_domain='ipr.univ-rennes1.fr')
|
||||||
|
|
||||||
self.assertIsInstance(job_state, JobsState)
|
self.assertIsInstance(job_state, JobsState)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue