fixed pylint errors and cleaned up
work related to Bug 3315 - make simpaweb django app a packageable application
This commit is contained in:
parent
7a5d32dec0
commit
270304f58e
|
@ -2,313 +2,325 @@
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0, '..')
|
sys.path.insert(0, '..')
|
||||||
import os
|
import os
|
||||||
|
import MySQLdb
|
||||||
|
import threading
|
||||||
from Lib.Util import *
|
from Lib.Util import *
|
||||||
from Lib.SimpaDbUtil import *
|
from Lib.SimpaDbUtil import *
|
||||||
import time
|
import time
|
||||||
from ClusterStatus import ClusterStatus
|
from ClusterStatus import ClusterStatus
|
||||||
from SlotAllocator import *
|
from SlotAllocator import DecoupledSlotAllocator
|
||||||
from Log import *
|
from Log import logDebug, logInfo
|
||||||
from ClusterNodeStatusUpdater import *
|
from ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier
|
||||||
from SunGridEngine import SunGridEngine
|
from SunGridEngine import SunGridEngine
|
||||||
import Util
|
from Util import log, onException
|
||||||
from WebServer import WebServerThread
|
from WebServer import WebServerThread
|
||||||
|
from PowerState import PowerState
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
|
|
||||||
VERSION='1.18'
|
VERSION = '1.18'
|
||||||
|
|
||||||
|
|
||||||
class MyHTMLParser(HTMLParser):
|
class MyHTMLParser(HTMLParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
self.TokenList = []
|
self.TokenList = []
|
||||||
def handle_data( self,data):
|
|
||||||
data = data.strip()
|
def handle_data(self, data):
|
||||||
if data and len(data) > 0:
|
data = data.strip()
|
||||||
self.TokenList.append(data)
|
if data and len(data) > 0:
|
||||||
#print data
|
self.TokenList.append(data)
|
||||||
def GetTokenList(self):
|
# print data
|
||||||
return self.TokenList
|
|
||||||
|
def GetTokenList(self):
|
||||||
|
return self.TokenList
|
||||||
|
|
||||||
|
|
||||||
class WakeUpCompleteNotifier( IWakeUpCompleteNotifier ):
|
class WakeUpCompleteNotifier(IWakeUpCompleteNotifier):
|
||||||
def __init__(self, machineName, clusterController):
|
|
||||||
self.m_machineName = machineName
|
|
||||||
self.m_clusterController = clusterController
|
|
||||||
def onWakeUpComplete( self ):
|
|
||||||
logDebug('WakeUpCompleteNotifier::onWakeUpComplete : start')
|
|
||||||
self.m_clusterController.onMachineWakeUpComplete( self.m_machineName )
|
|
||||||
|
|
||||||
class SleepCompleteNotifier( ISleepCompleteNotifier ):
|
def __init__(self, machineName, clusterController):
|
||||||
def __init__(self, machineName, clusterController):
|
self.m_machineName = machineName
|
||||||
self.m_machineName = machineName
|
self.m_clusterController = clusterController
|
||||||
self.m_clusterController = clusterController
|
|
||||||
def onSleepComplete( self, bSleepSucceeded ):
|
def onWakeUpComplete(self):
|
||||||
logDebug('SleepCompleteNotifier::onSleepComplete : start')
|
logDebug('WakeUpCompleteNotifier::onWakeUpComplete : start')
|
||||||
self.m_clusterController.onMachineSleepComplete( self.m_machineName, bSleepSucceeded )
|
self.m_clusterController.onMachineWakeUpComplete(self.m_machineName)
|
||||||
|
|
||||||
|
|
||||||
|
class SleepCompleteNotifier(ISleepCompleteNotifier):
|
||||||
|
|
||||||
|
def __init__(self, machineName, clusterController):
|
||||||
|
self.m_machineName = machineName
|
||||||
|
self.m_clusterController = clusterController
|
||||||
|
|
||||||
|
def onSleepComplete(self, bSleepSucceeded):
|
||||||
|
logDebug('SleepCompleteNotifier::onSleepComplete : start')
|
||||||
|
self.m_clusterController.onMachineSleepComplete(self.m_machineName, bSleepSucceeded)
|
||||||
|
|
||||||
|
|
||||||
|
def jouleToKwh(fEnergyInJoules):
    """
    Converts an energy expressed in joules into kilowatt-hours.

    :param fEnergyInJoules: energy in joules
    :return: the same amount of energy in kWh (float)
    """
    # 1 kWh = 1000 W * 3600 s = 3.6e6 J
    return fEnergyInJoules / (1000.0 * 3600.0)
|
||||||
|
|
||||||
def jouleToKwh( fEnergyInJoules ):
|
|
||||||
"""
|
|
||||||
converts joules to kWH
|
|
||||||
"""
|
|
||||||
# 1 kWh = 1000 * 3600 J
|
|
||||||
return fEnergyInJoules / (1000.0 * 3600.0)
|
|
||||||
|
|
||||||
class ClusterController:
|
class ClusterController:
|
||||||
"""
|
"""
|
||||||
The cluster controller monitors the cluster's activity and has multiple purposes :
|
The cluster controller monitors the cluster's activity and has multiple purposes :
|
||||||
- energy saving : it can put some machines to sleep if they have nothing to do, or it
|
- energy saving : it can put some machines to sleep if they have nothing to do, or it
|
||||||
can wake them up when needed (eg when a new job has arrived)
|
can wake them up when needed (eg when a new job has arrived)
|
||||||
- auto-repair : for examples
|
- auto-repair : for examples
|
||||||
- it happened sometimes that sge_execd process disappeared for some unknown reason
|
- it happened sometimes that sge_execd process disappeared for some unknown reason
|
||||||
in that case, the cluster controller can detect it and restart the daemon
|
in that case, the cluster controller can detect it and restart the daemon
|
||||||
automatically, without administrator's intervention
|
automatically, without administrator's intervention
|
||||||
- clear the Error state of queues
|
- clear the Error state of queues
|
||||||
- it could also be used to dynamically adapt sge's settings to the requirements of
|
- it could also be used to dynamically adapt sge's settings to the requirements of
|
||||||
jobs (eg add some machines to a queue).
|
jobs (eg add some machines to a queue).
|
||||||
Mechanism to let user get priority
|
Mechanism to let user get priority
|
||||||
"""
|
"""
|
||||||
def __init__( self ):
|
def __init__(self):
|
||||||
gridEngine = SunGridEngine()
|
gridEngine = SunGridEngine()
|
||||||
self.m_clusterStatus = ClusterStatus( gridEngine )
|
self.m_clusterStatus = ClusterStatus(gridEngine)
|
||||||
self.m_slotAllocator = DecoupledSlotAllocator() #SimpleSlotAllocator()
|
self.m_slotAllocator = DecoupledSlotAllocator() # SimpleSlotAllocator()
|
||||||
self.m_machinesThatNeedWakeUp = {}
|
self.m_machinesThatNeedWakeUp = {}
|
||||||
self.m_machinesThatNeedWakeupLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedWakeUp
|
self.m_machinesThatNeedWakeupLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedWakeUp
|
||||||
self.m_machinesThatNeedSleeping = {}
|
self.m_machinesThatNeedSleeping = {}
|
||||||
self.m_machinesThatNeedSleepingLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedSleeping
|
self.m_machinesThatNeedSleepingLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedSleeping
|
||||||
self.m_lastEnergyStatusLogTime = None
|
self.m_lastEnergyStatusLogTime = None
|
||||||
self.DELAY_BETWEEN_ENERGY_STATUS_LOGS = 60 # in seconds
|
self.DELAY_BETWEEN_ENERGY_STATUS_LOGS = 60 # in seconds
|
||||||
self.m_iSessionId = None # session (run) identifier in database
|
self.m_iSessionId = None # session (run) identifier in database
|
||||||
self.m_webServer = WebServerThread(self)
|
self.m_webServer = WebServerThread(self)
|
||||||
self.m_bStop = False
|
self.m_bStop = False
|
||||||
self.m_bStopLock = threading.Lock() # to prevent concurrent access to m_bStop
|
self.m_bStopLock = threading.Lock() # to prevent concurrent access to m_bStop
|
||||||
|
|
||||||
def getClusterStatus( self ):
|
|
||||||
return self.m_clusterStatus
|
|
||||||
|
|
||||||
def log( self, message ):
|
|
||||||
print message
|
|
||||||
|
|
||||||
def shutdownLeastImportantNode( self ):
|
|
||||||
self.log("ClusterController::shutdownLeastImportantNode : start")
|
|
||||||
|
|
||||||
def onMachineWakeUpComplete( self, machineName ):
|
|
||||||
self.m_machinesThatNeedWakeupLock.acquire()
|
|
||||||
#logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
|
|
||||||
del self.m_machinesThatNeedWakeUp[ machineName ]
|
|
||||||
#logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
|
|
||||||
self.m_machinesThatNeedWakeupLock.release()
|
|
||||||
logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)
|
|
||||||
|
|
||||||
def onMachineSleepComplete( self, machineName, bSleepSucceeded ):
|
def getClusterStatus(self):
|
||||||
self.m_machinesThatNeedSleepingLock.acquire()
|
return self.m_clusterStatus
|
||||||
#logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
|
|
||||||
del self.m_machinesThatNeedSleeping[ machineName ]
|
|
||||||
#logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
|
|
||||||
self.m_machinesThatNeedSleepingLock.release()
|
|
||||||
if bSleepSucceeded:
|
|
||||||
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
|
|
||||||
else:
|
|
||||||
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)
|
|
||||||
|
|
||||||
def getNumPendingWakeUps( self ):
|
|
||||||
self.m_machinesThatNeedWakeupLock.acquire()
|
|
||||||
numPendingWakeUps = len(self.m_machinesThatNeedWakeUp)
|
|
||||||
self.m_machinesThatNeedWakeupLock.release()
|
|
||||||
return numPendingWakeUps
|
|
||||||
|
|
||||||
def getNumPendingSleeps( self ):
|
def log(self, message):
|
||||||
self.m_machinesThatNeedSleepingLock.acquire()
|
print(message)
|
||||||
numPendingSleeps = len(self.m_machinesThatNeedSleeping)
|
|
||||||
self.m_machinesThatNeedSleepingLock.release()
|
|
||||||
return numPendingSleeps
|
|
||||||
|
|
||||||
def putIdleMachinesToSleep( self ):
|
|
||||||
self.m_clusterStatus.m_lock.acquire()
|
|
||||||
idleMachines = self.m_clusterStatus.getIdleMachines()
|
|
||||||
# logInfo('idleMachines :')
|
|
||||||
self.m_machinesThatNeedToSleep = []
|
|
||||||
for machineName, idleMachine in idleMachines.items():
|
|
||||||
if idleMachine.getPowerState() == PowerState.ON:
|
|
||||||
# logInfo('\t%s' % machineName)
|
|
||||||
if idleMachine.getName() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things
|
|
||||||
self.m_machinesThatNeedSleeping[idleMachine.getName()]=idleMachine
|
|
||||||
self.m_clusterStatus.m_lock.release()
|
|
||||||
|
|
||||||
listOfMachinesThatNeedSleeping = self.m_machinesThatNeedSleeping.values() # duplicate the list so that we don't iterate on m_machinesThatNeedSleeping, which could cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
|
def shutdownLeastImportantNode(self):
|
||||||
for machine in listOfMachinesThatNeedSleeping:
|
self.log("ClusterController::shutdownLeastImportantNode : start")
|
||||||
logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName())
|
|
||||||
machine.requestSleep( SleepCompleteNotifier( machine.getName(), self ) )
|
|
||||||
|
|
||||||
if len(listOfMachinesThatNeedSleeping) != 0:
|
|
||||||
# hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
|
|
||||||
while self.getNumPendingSleeps() > 0:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
|
def onMachineWakeUpComplete(self, machineName):
|
||||||
def wakeUpMachinesForPendingJobs(self):
|
self.m_machinesThatNeedWakeupLock.acquire()
|
||||||
listOfMachinesThatNeedWakeUp = []
|
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
||||||
|
del self.m_machinesThatNeedWakeUp[machineName]
|
||||||
self.m_clusterStatus.m_lock.acquire()
|
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
||||||
pendingJobs = self.m_clusterStatus.getPendingJobs()
|
self.m_machinesThatNeedWakeupLock.release()
|
||||||
"""
|
logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)
|
||||||
logInfo('pending jobs :')
|
|
||||||
for job in pendingJobs.values():
|
|
||||||
logInfo('\t%d' % job.getId().asStr())
|
|
||||||
"""
|
|
||||||
if len(pendingJobs) != 0:
|
|
||||||
self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp( pendingJobs, self.m_clusterStatus )
|
|
||||||
if len(self.m_machinesThatNeedWakeUp) == 0:
|
|
||||||
None
|
|
||||||
#logInfo('ClusterController::updateNormalState : no machine needs waking up' )
|
|
||||||
else:
|
|
||||||
listOfMachinesThatNeedWakeUp = self.m_machinesThatNeedWakeUp.values() # duplicate the list so that we don't iterate on m_machinesThatNeedWakeUp, which would cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
|
|
||||||
for machine in listOfMachinesThatNeedWakeUp:
|
|
||||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for '+machine.getName() )
|
|
||||||
machine.requestWakeUp( WakeUpCompleteNotifier( machine.getName(), self ) )
|
|
||||||
self.m_clusterStatus.m_lock.release()
|
|
||||||
|
|
||||||
if len(listOfMachinesThatNeedWakeUp) != 0:
|
|
||||||
# hack : wait until the wakeup requests are handled so that a later sleep request doesn't cancel it
|
|
||||||
# and also wait for the jobs to come in
|
|
||||||
while self.getNumPendingWakeUps() > 0:
|
|
||||||
time.sleep(1)
|
|
||||||
iSGE_CHEK_RUNNABLE_JOBS_DELAY = 60 * 5 # max time it takes for sge between the fact that a queued job is runnable and SGE actually starting it (I've put a long time here because sometimes, qstat takes a long time to ralise that the machine is available after I wake it up)
|
|
||||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : all required machines are awake. Now give %d seconds to SGE to allocate slots.' % iSGE_CHEK_RUNNABLE_JOBS_DELAY)
|
|
||||||
# wait until SGE has a chance to allocate slots
|
|
||||||
time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time....
|
|
||||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots')
|
|
||||||
|
|
||||||
def updateNormalState( self ):
|
|
||||||
# attempt to shut down machines that are idle
|
|
||||||
self.putIdleMachinesToSleep()
|
|
||||||
# wake up necessary machines if there are pending jobs
|
|
||||||
self.wakeUpMachinesForPendingJobs()
|
|
||||||
|
|
||||||
def storeSessionInDatabase( self ):
|
|
||||||
conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
|
|
||||||
assert(conn)
|
|
||||||
|
|
||||||
# retrieve the session id, as it's an auto_increment field
|
|
||||||
sqlCommand = "SELECT AUTO_INCREMENT FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'clustercontroller' AND TABLE_NAME = 'sessions_desc'"
|
|
||||||
print sqlCommand
|
|
||||||
conn.query(sqlCommand)
|
|
||||||
r=conn.store_result()
|
|
||||||
iSessionId = r.fetch_row()[0][0]
|
|
||||||
|
|
||||||
# stores information about the session
|
|
||||||
sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.m_clusterStatus.m_clusterNodes))
|
|
||||||
print sqlCommand
|
|
||||||
conn.query(sqlCommand)
|
|
||||||
|
|
||||||
# initialize the energy savings table
|
|
||||||
sqlCommand = "INSERT INTO session_to_energy_savings (session_id, energy_savings_kwh) VALUES (%d,0.0);" % (iSessionId)
|
|
||||||
print sqlCommand
|
|
||||||
conn.query(sqlCommand)
|
|
||||||
|
|
||||||
conn.close()
|
def onMachineSleepComplete(self, machineName, bSleepSucceeded):
|
||||||
print( 'Session Iid = %d' % iSessionId )
|
self.m_machinesThatNeedSleepingLock.acquire()
|
||||||
return iSessionId
|
# logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
||||||
|
del self.m_machinesThatNeedSleeping[machineName]
|
||||||
def updateSessionEnergyConsumptionInDatabase( self ):
|
# logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
|
||||||
conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller')
|
self.m_machinesThatNeedSleepingLock.release()
|
||||||
assert(conn)
|
if bSleepSucceeded:
|
||||||
|
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
|
||||||
# update energy savings for the current session
|
else:
|
||||||
sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % ( jouleToKwh(self.m_clusterStatus.getEnergySavings()) ,self.m_iSessionId)
|
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)
|
||||||
print sqlCommand
|
|
||||||
conn.query(sqlCommand)
|
|
||||||
|
|
||||||
# update the end time of the current session
|
|
||||||
sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.m_iSessionId)
|
|
||||||
print sqlCommand
|
|
||||||
conn.query(sqlCommand)
|
|
||||||
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
def setControlOnMachine(self, machineName, bControl):
|
|
||||||
"""
|
|
||||||
adds or removes the control of ClusterController on the given machine
|
|
||||||
"""
|
|
||||||
self.m_clusterStatus.setControlOnMachine(machineName, bControl)
|
|
||||||
|
|
||||||
def run( self ):
|
|
||||||
"""
|
|
||||||
"""
|
|
||||||
self.m_iSessionId = self.storeSessionInDatabase()
|
|
||||||
log("storeSessionInDatabase completed")
|
|
||||||
DELAY_BETWEEN_MEASURES = 10 # in seconds
|
|
||||||
self.m_clusterStatus.startReadingThreads()
|
|
||||||
self.m_webServer.start()
|
|
||||||
while not self.m_clusterStatus.isReady():
|
|
||||||
log('waiting for system to be ready')
|
|
||||||
time.sleep(1)
|
|
||||||
None
|
|
||||||
logInfo('ClusterController::run : cluster initial readings have completed')
|
|
||||||
startTime = time.localtime()
|
|
||||||
while not self.m_bStop:
|
|
||||||
currentTime = time.time()
|
|
||||||
#clusterStatus.m_nodesStatus['simpatix10'].dump()
|
|
||||||
if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime +self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
|
|
||||||
iNumMachines = len(self.m_clusterStatus.m_clusterNodes)
|
|
||||||
iNumMachinesOn = 0
|
|
||||||
iNumSleepingMachines = 0
|
|
||||||
for machine in self.m_clusterStatus.m_clusterNodes.values():
|
|
||||||
ePowerState = machine.getPowerState()
|
|
||||||
if ePowerState == PowerState.ON:
|
|
||||||
iNumMachinesOn+=1
|
|
||||||
elif ePowerState == PowerState.SLEEP:
|
|
||||||
iNumSleepingMachines+=1
|
|
||||||
logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))
|
|
||||||
iNumSlots = self.m_clusterStatus.getNumControlledSlots()
|
|
||||||
iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots()
|
|
||||||
iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots()
|
|
||||||
iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots()
|
|
||||||
logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots ))
|
|
||||||
logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()) )
|
|
||||||
logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings())))
|
|
||||||
self.updateSessionEnergyConsumptionInDatabase()
|
|
||||||
self.m_lastEnergyStatusLogTime = currentTime
|
|
||||||
|
|
||||||
self.updateNormalState()
|
def getNumPendingWakeUps(self):
|
||||||
time.sleep(DELAY_BETWEEN_MEASURES)
|
self.m_machinesThatNeedWakeupLock.acquire()
|
||||||
self.m_clusterStatus.stopReadingThreads()
|
numPendingWakeUps = len(self.m_machinesThatNeedWakeUp)
|
||||||
|
self.m_machinesThatNeedWakeupLock.release()
|
||||||
|
return numPendingWakeUps
|
||||||
|
|
||||||
|
def getNumPendingSleeps(self):
    """
    Returns the number of entries currently registered in
    m_machinesThatNeedSleeping.

    The count is read while holding m_machinesThatNeedSleepingLock; the
    'with' statement guarantees the lock is released even if reading fails.
    """
    with self.m_machinesThatNeedSleepingLock:
        return len(self.m_machinesThatNeedSleeping)
|
||||||
|
|
||||||
|
def putIdleMachinesToSleep(self):
    """
    Requests sleep for the idle machines that are powered on, then blocks
    until no sleep request is pending any more.

    Machines reported by m_clusterStatus.getIdleMachines() whose power state
    is PowerState.ON are registered in m_machinesThatNeedSleeping (except
    'simpatix10') and each registered machine receives a requestSleep() with
    a SleepCompleteNotifier bound to this controller.
    """
    sleepCandidates = []
    self.m_clusterStatus.m_lock.acquire()
    try:
        idleMachines = self.m_clusterStatus.getIdleMachines()
        # kept from the original code; this attribute is not read anywhere in the visible source
        self.m_machinesThatNeedToSleep = []
        for idleMachine in idleMachines.values():
            if idleMachine.getPowerState() != PowerState.ON:
                continue
            # never put simpatix10 to sleep because it's the sge master and is also server for other things
            if idleMachine.getName() != 'simpatix10':
                sleepCandidates.append(idleMachine)
    finally:
        # release the cluster status lock even if one of the queries above raised
        self.m_clusterStatus.m_lock.release()

    with self.m_machinesThatNeedSleepingLock:
        for idleMachine in sleepCandidates:
            self.m_machinesThatNeedSleeping[idleMachine.getName()] = idleMachine
        # real copy (not a live dict view) so that the completion callbacks, which
        # delete entries from m_machinesThatNeedSleeping, can't break the iteration below
        machinesToSleep = list(self.m_machinesThatNeedSleeping.values())

    for machine in machinesToSleep:
        logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName())
        machine.requestSleep(SleepCompleteNotifier(machine.getName(), self))

    if machinesToSleep:
        # hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
        while self.getNumPendingSleeps() > 0:
            time.sleep(1)
|
||||||
|
|
||||||
|
def wakeUpMachinesForPendingJobs(self):
    """
    Wakes up the machines that the slot allocator deems necessary to run the
    pending jobs, then blocks until they are awake and SGE had time to
    allocate slots.

    When m_clusterStatus.getPendingJobs() is not empty, m_machinesThatNeedWakeUp
    is replaced by the result of m_slotAllocator.getMachinesThatNeedWakeUp()
    and each of these machines receives a requestWakeUp() with a
    WakeUpCompleteNotifier bound to this controller.
    """
    machinesToWakeUp = []

    self.m_clusterStatus.m_lock.acquire()
    try:
        pendingJobs = self.m_clusterStatus.getPendingJobs()
        if len(pendingJobs) != 0:
            self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp(pendingJobs, self.m_clusterStatus)
            if len(self.m_machinesThatNeedWakeUp) != 0:
                # real copy (not a live dict view) so that the completion callbacks, which
                # delete entries from m_machinesThatNeedWakeUp, can neither break this
                # iteration nor empty the list tested after the lock is released
                machinesToWakeUp = list(self.m_machinesThatNeedWakeUp.values())
                for machine in machinesToWakeUp:
                    logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for ' + machine.getName())
                    machine.requestWakeUp(WakeUpCompleteNotifier(machine.getName(), self))
    finally:
        # release the cluster status lock even if one of the calls above raised
        self.m_clusterStatus.m_lock.release()

    if machinesToWakeUp:
        # hack : wait until the wakeup requests are handled so that a later sleep request doesn't cancel it
        # and also wait for the jobs to come in
        while self.getNumPendingWakeUps() > 0:
            time.sleep(1)
        # max time it takes for sge between the fact that a queued job is runnable and SGE actually
        # starting it (long on purpose : qstat sometimes takes a long time to realise that a machine
        # is available after it has been woken up)
        iSGE_CHEK_RUNNABLE_JOBS_DELAY = 60 * 5
        logInfo('ClusterController::wakeUpMachinesForPendingJobs : all required machines are awake. Now give %d seconds to SGE to allocate slots.' % iSGE_CHEK_RUNNABLE_JOBS_DELAY)
        # wait until SGE has a chance to allocate slots
        # note : this blocks the calling thread for the whole delay. This could be improved if we
        # forbid the machines to go to sleep for that much time....
        time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY)
        logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots')
|
||||||
|
|
||||||
|
def updateNormalState(self):
    """
    Performs one regulation step : first tries to put idle machines to sleep,
    then wakes up the machines needed by pending jobs.
    """
    # attempt to shut down machines that are idle
    self.putIdleMachinesToSleep()
    # wake up necessary machines if there are pending jobs
    self.wakeUpMachinesForPendingJobs()
|
||||||
|
|
||||||
|
def storeSessionInDatabase(self):
    """
    Registers this run of the controller in the 'clustercontroller' database.

    Reads the next AUTO_INCREMENT value of the sessions_desc table, inserts a
    row describing the session (program version, pid, number of controlled
    machines) and initializes the energy savings of the session to 0.

    :return: the value read from AUTO_INCREMENT, used as session identifier
    """
    conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
    assert conn
    try:
        # retrieve the session id, as it's an auto_increment field
        # NOTE(review): the id is read before the INSERT below; presumably another writer
        # inserting in between would make both values disagree - confirm there is a single writer
        sqlCommand = "SELECT AUTO_INCREMENT FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'clustercontroller' AND TABLE_NAME = 'sessions_desc'"
        print(sqlCommand)
        conn.query(sqlCommand)
        r = conn.store_result()
        iSessionId = r.fetch_row()[0][0]

        # stores information about the session
        sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.m_clusterStatus.m_clusterNodes))
        print(sqlCommand)
        conn.query(sqlCommand)

        # initialize the energy savings table
        sqlCommand = "INSERT INTO session_to_energy_savings (session_id, energy_savings_kwh) VALUES (%d,0.0);" % (iSessionId)
        print(sqlCommand)
        conn.query(sqlCommand)
    finally:
        # don't leak the connection if one of the queries fails
        conn.close()

    print('Session Iid = %d' % iSessionId)
    return iSessionId
|
||||||
|
|
||||||
|
def updateSessionEnergyConsumptionInDatabase(self):
    """
    Writes the current energy savings (in kWh) of session m_iSessionId into
    the 'clustercontroller' database and sets the end time of that session
    to the current time.
    """
    conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller')
    assert conn
    try:
        # update energy savings for the current session
        sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.m_clusterStatus.getEnergySavings()), self.m_iSessionId)
        print(sqlCommand)
        conn.query(sqlCommand)

        # update the end time of the current session
        sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.m_iSessionId)
        print(sqlCommand)
        conn.query(sqlCommand)
    finally:
        # don't leak the connection if one of the queries fails
        conn.close()
|
||||||
|
|
||||||
|
def setControlOnMachine(self, machineName, bControl):
    """
    Adds or removes the control of ClusterController on the given machine.

    The request is simply forwarded to m_clusterStatus.setControlOnMachine().

    :param machineName: name of the machine
    :param bControl: forwarded as is to the cluster status
    """
    self.m_clusterStatus.setControlOnMachine(machineName, bControl)
|
||||||
|
|
||||||
|
def run(self):
    """
    Main loop of the cluster controller.

    Stores the session in the database, starts the reading threads of the
    cluster status and the web server, waits until the cluster status is
    ready, then loops until m_bStop is set. Each iteration :
    - logs the machines / slots / power / energy figures and updates the
      session in the database, at most once every
      DELAY_BETWEEN_ENERGY_STATUS_LOGS seconds
    - calls updateNormalState()
    - sleeps DELAY_BETWEEN_MEASURES seconds

    The reading threads are stopped when the loop ends, including when it
    ends because of an exception.
    """
    self.m_iSessionId = self.storeSessionInDatabase()
    log("storeSessionInDatabase completed")
    DELAY_BETWEEN_MEASURES = 10  # in seconds
    self.m_clusterStatus.startReadingThreads()
    try:
        self.m_webServer.start()
        while not self.m_clusterStatus.isReady():
            log('waiting for system to be ready')
            time.sleep(1)
        logInfo('ClusterController::run : cluster initial readings have completed')
        startTime = time.localtime()
        # NOTE(review): m_bStop is read here without taking m_bStopLock - confirm this is intended
        while not self.m_bStop:
            currentTime = time.time()
            if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
                iNumMachines = len(self.m_clusterStatus.m_clusterNodes)
                iNumMachinesOn = 0
                iNumSleepingMachines = 0
                for machine in self.m_clusterStatus.m_clusterNodes.values():
                    ePowerState = machine.getPowerState()
                    if ePowerState == PowerState.ON:
                        iNumMachinesOn += 1
                    elif ePowerState == PowerState.SLEEP:
                        iNumSleepingMachines += 1
                logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))

                iNumSlots = self.m_clusterStatus.getNumControlledSlots()
                iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots()
                iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots()
                iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots()
                logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots))

                logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()))
                logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings())))
                self.updateSessionEnergyConsumptionInDatabase()
                self.m_lastEnergyStatusLogTime = currentTime

            self.updateNormalState()
            time.sleep(DELAY_BETWEEN_MEASURES)
    finally:
        # always stop the reading threads started above, even if the loop raised
        self.m_clusterStatus.stopReadingThreads()
|
||||||
|
|
||||||
|
|
||||||
|
def storeClusterNodeStatus(clusterNodeStatus):
    """
    Logs the current reading of every sensor of the given node into the
    'simpa_measurements' database.

    Sensors whose typeName() is 'Fan' are stored in fan_rpm_logs and those
    whose typeName() is 'Temperature' in temperature_logs. The sensor
    identifier is '<m_clusterNodeName>_<sensor.m_name>'.

    :param clusterNodeStatus: object exposing m_clusterNodeName and m_sensors
    :raises AssertionError: if a sensor has any other type name
    """
    conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
    assert conn
    try:
        for sensor in clusterNodeStatus.m_sensors.values():
            sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name
            sensorType = sensor.typeName()
            if sensorType == 'Fan':
                sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_rpms) + """, NOW());"""
            elif sensorType == 'Temperature':
                sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_temperature) + """, NOW());"""
            else:
                # explicit raise instead of 'assert False', which is stripped when python runs with -O
                raise AssertionError('unexpected sensor type : %s' % sensorType)
            print(sqlCommand)
            conn.query(sqlCommand)
    finally:
        # don't leak the connection if a query fails or a sensor type is unknown
        conn.close()
|
||||||
|
|
||||||
def storeClusterNodeStatus( clusterNodeStatus ):
|
|
||||||
#conn = MySQLdb.connect('simpatix10', 'measures_writer', '', 'simpa_measurements')
|
|
||||||
conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
|
|
||||||
assert(conn)
|
|
||||||
#conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""")
|
|
||||||
'''
|
|
||||||
conn.query("""SELECT * FROM fan_rpm_logs""")
|
|
||||||
r=conn.store_result()
|
|
||||||
print r.fetch_row()[0]
|
|
||||||
'''
|
|
||||||
for key, sensor in clusterNodeStatus.m_sensors.items():
|
|
||||||
sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name
|
|
||||||
if sensor.typeName() == 'Fan':
|
|
||||||
sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('"""+sensorId+"""', """+str(sensor.m_rpms)+""", NOW());"""
|
|
||||||
print sqlCommand
|
|
||||||
conn.query(sqlCommand)
|
|
||||||
elif sensor.typeName() == 'Temperature':
|
|
||||||
sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('"""+sensorId+"""', """+str(sensor.m_temperature)+""", NOW());"""
|
|
||||||
print sqlCommand
|
|
||||||
conn.query(sqlCommand)
|
|
||||||
else:
|
|
||||||
assert(False)
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
#Lib.Util.sendTextMail( 'SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content')
|
# Lib.Util.sendTextMail('SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content')
|
||||||
try:
|
try:
|
||||||
logInfo('ClusterController v. %s starting....' % VERSION)
|
logInfo('ClusterController v. %s starting....' % VERSION)
|
||||||
#executeCommand('ping -o -t 1 simpatix310 > /dev/null')
|
# executeCommand('ping -o -t 1 simpatix310 > /dev/null')
|
||||||
#print executeCommand('ssh simpatix10 "ipmitool sensor"')
|
# print executeCommand('ssh simpatix10 "ipmitool sensor"')
|
||||||
#assert False, 'prout'
|
# assert False, 'prout'
|
||||||
controller = ClusterController()
|
controller = ClusterController()
|
||||||
controller.run()
|
controller.run()
|
||||||
#machineNameToMacAddress( 'simpatix10' )
|
# machineNameToMacAddress('simpatix10')
|
||||||
#except AssertionError, error:
|
# except AssertionError, error:
|
||||||
#except KeyboardInterrupt, error:
|
# except KeyboardInterrupt, error:
|
||||||
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||||
Util.onException(exception)
|
onException(exception)
|
||||||
|
|
|
@ -1,140 +1,142 @@
|
||||||
import threading
|
import threading
|
||||||
from PowerState import *
|
from PowerState import PowerState, PowerStateToStr
|
||||||
from ClusterNodeStatusUpdater import *
|
from ClusterNodeStatusUpdater import ClusterNodeStatusUpdater
|
||||||
import Lib.Util
|
import Lib.Util
|
||||||
import Lib.SimpaDbUtil
|
import Lib.SimpaDbUtil
|
||||||
|
from Log import logInfo, logWarning
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
from datetime import *
|
|
||||||
|
|
||||||
class ClusterNode:
|
class ClusterNode:
|
||||||
"""
|
"""
|
||||||
the state of a machine node
|
the state of a machine node
|
||||||
"""
|
"""
|
||||||
def __init__( self, machineName, cluster, gridEngine ):
|
def __init__(self, machineName, cluster, gridEngine):
|
||||||
self.m_name = machineName
|
self.m_name = machineName
|
||||||
self.m_cluster = cluster # the cluster this machine belongs to
|
self.m_cluster = cluster # the cluster this machine belongs to
|
||||||
self.m_requestedPowerState = PowerState.ON
|
self.m_requestedPowerState = PowerState.ON
|
||||||
self.m_powerState = PowerState.UNKNOWN
|
self.m_powerState = PowerState.UNKNOWN
|
||||||
self.m_lastPowerStateTime = None # time at which the last value of self.m_powerState has been set
|
self.m_lastPowerStateTime = None # time at which the last value of self.m_powerState has been set
|
||||||
self.m_machineStatusUpdater = ClusterNodeStatusUpdater( machineName, self, gridEngine )
|
self.m_machineStatusUpdater = ClusterNodeStatusUpdater(machineName, self, gridEngine)
|
||||||
self.m_energyConsumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
|
self.m_energyConsumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
|
||||||
self.m_energySavings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
|
self.m_energySavings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
|
||||||
|
|
||||||
def getName( self ):
|
|
||||||
return self.m_name
|
|
||||||
|
|
||||||
def isReady( self ):
|
|
||||||
if self.m_powerState == PowerState.UNKNOWN:
|
|
||||||
#logInfo( self.m_name + ' is not ready (waiting for power state)' )
|
|
||||||
return False
|
|
||||||
if self.m_powerState == PowerState.ON:
|
|
||||||
return True
|
|
||||||
#log( self.m_name + ' is ready' )
|
|
||||||
return True
|
|
||||||
|
|
||||||
def getPowerState( self ):
|
|
||||||
return self.m_powerState
|
|
||||||
|
|
||||||
def setShouldAlwaysBeOn( self ):
|
|
||||||
self.m_machineStatusUpdater.setShouldAlwaysBeOn( )
|
|
||||||
self.setPowerState( PowerState.ON )
|
|
||||||
|
|
||||||
def setPowerState( self, powerState ):
|
|
||||||
bUpdateRequiredChecks = False
|
|
||||||
if self.m_powerState == PowerState.UNKNOWN:
|
|
||||||
logInfo('ClusterNode::setPowerState : '+self.m_name+'\'s power state has been initialized to '+PowerStateToStr( powerState ))
|
|
||||||
self.m_powerState = powerState
|
|
||||||
self.m_lastPowerStateTime = datetime.now()
|
|
||||||
bUpdateRequiredChecks = True
|
|
||||||
else:
|
|
||||||
# update the estimation of energy consumption
|
|
||||||
self.updateEnergyMeasurements()
|
|
||||||
# then change the power state
|
|
||||||
if self.m_powerState != powerState:
|
|
||||||
logInfo('ClusterNode::setPowerState : '+self.m_name+'\'s power state has been changed to '+PowerStateToStr( powerState ))
|
|
||||||
self.m_powerState = powerState
|
|
||||||
self.m_lastPowerStateTime = datetime.now()
|
|
||||||
bUpdateRequiredChecks = True
|
|
||||||
if bUpdateRequiredChecks:
|
|
||||||
if self.m_powerState == PowerState.ON:
|
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = True
|
|
||||||
elif self.m_powerState == PowerState.OFF:
|
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = False
|
|
||||||
elif self.m_powerState == PowerState.SLEEP:
|
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = False
|
|
||||||
elif self.m_powerState == PowerState.UNPLUGGED:
|
|
||||||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
|
||||||
self.m_machineStatusUpdater.m_bCheckSensors = False
|
|
||||||
else:
|
|
||||||
assert( False )
|
|
||||||
|
|
||||||
def onNewPowerStateReading( self, powerState ):
|
|
||||||
"""
|
|
||||||
called when a new powerstate reading arrives
|
|
||||||
"""
|
|
||||||
if powerState != self.getPowerState():
|
|
||||||
if self.getPowerState() != PowerState.UNKNOWN:
|
|
||||||
logWarning('ClusterNode::onNewPowerStateReading : '+self.m_name+'\'s power state has been (manually it seems) changed to '+PowerStateToStr( powerState ))
|
|
||||||
self.setPowerState( powerState )
|
|
||||||
|
|
||||||
def getPowerConsumptionForPowerState( self, ePowerState ):
|
def getName(self):
|
||||||
"""
|
return self.m_name
|
||||||
returns the power consumption estimation (in watts) of this machine for the given power state
|
|
||||||
"""
|
def isReady(self):
|
||||||
fCurrentIntensity = 0.0
|
if self.m_powerState == PowerState.UNKNOWN:
|
||||||
fCurrentVoltage = 220.0
|
# logInfo(self.m_name + ' is not ready (waiting for power state)')
|
||||||
# noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine
|
return False
|
||||||
if ePowerState == PowerState.ON:
|
if self.m_powerState == PowerState.ON:
|
||||||
fCurrentIntensity = 0.9 # value when the machine is doing nothing
|
return True
|
||||||
elif ePowerState == PowerState.OFF:
|
# log(self.m_name + ' is ready')
|
||||||
fCurrentIntensity = 0.1
|
return True
|
||||||
elif ePowerState == PowerState.SLEEP:
|
|
||||||
fCurrentIntensity = 0.1
|
def getPowerState(self):
|
||||||
elif ePowerState == PowerState.UNPLUGGED:
|
return self.m_powerState
|
||||||
fCurrentIntensity = 0.0
|
|
||||||
else:
|
def setShouldAlwaysBeOn(self):
|
||||||
assert(False)
|
self.m_machineStatusUpdater.setShouldAlwaysBeOn()
|
||||||
return fCurrentIntensity * fCurrentVoltage
|
self.setPowerState(PowerState.ON)
|
||||||
|
|
||||||
def updateEnergyMeasurements( self ):
|
def setPowerState(self, powerState):
|
||||||
timeInterval = datetime.now() - self.m_lastPowerStateTime
|
bUpdateRequiredChecks = False
|
||||||
self.m_energyConsumption += self.getPowerConsumptionForPowerState( self.m_powerState ) * timeInterval.seconds
|
if self.m_powerState == PowerState.UNKNOWN:
|
||||||
self.m_energySavings += ( self.getPowerConsumptionForPowerState( PowerState.ON ) - self.getPowerConsumptionForPowerState( self.m_powerState ) ) * timeInterval.seconds
|
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been initialized to ' + PowerStateToStr(powerState))
|
||||||
self.m_lastPowerStateTime = datetime.now()
|
self.m_powerState = powerState
|
||||||
#logDebug('energy savings on %s : %f J' %(self.getName(), self.m_energySavings))
|
self.m_lastPowerStateTime = datetime.now()
|
||||||
|
bUpdateRequiredChecks = True
|
||||||
def getEnergyConsumption( self ):
|
else:
|
||||||
"""
|
# update the estimation of energy consumption
|
||||||
in joules
|
self.updateEnergyMeasurements()
|
||||||
"""
|
# then change the power state
|
||||||
self.updateEnergyMeasurements()
|
if self.m_powerState != powerState:
|
||||||
return self.m_energyConsumption
|
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been changed to ' + PowerStateToStr(powerState))
|
||||||
|
self.m_powerState = powerState
|
||||||
def getPowerConsumption( self ):
|
self.m_lastPowerStateTime = datetime.now()
|
||||||
fCurrentPowerConsumption = self.getPowerConsumptionForPowerState( self.m_powerState )
|
bUpdateRequiredChecks = True
|
||||||
#logDebug('getPowerConsumption of %s : %f (powerstate = %d)' % (self.getName(), fCurrentPowerConsumption, self.m_powerState))
|
if bUpdateRequiredChecks:
|
||||||
return fCurrentPowerConsumption
|
if self.m_powerState == PowerState.ON:
|
||||||
|
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
||||||
def getEnergySavings( self ):
|
self.m_machineStatusUpdater.m_bCheckSensors = True
|
||||||
self.updateEnergyMeasurements()
|
elif self.m_powerState == PowerState.OFF:
|
||||||
return self.m_energySavings
|
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
||||||
|
self.m_machineStatusUpdater.m_bCheckSensors = False
|
||||||
def onSleepFailedBecauseAJobJustArrived( self ):
|
elif self.m_powerState == PowerState.SLEEP:
|
||||||
logInfo('%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name)
|
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
||||||
|
self.m_machineStatusUpdater.m_bCheckSensors = False
|
||||||
def requestSleep( self, sleepCompleteNotifier = None ):
|
elif self.m_powerState == PowerState.UNPLUGGED:
|
||||||
self.m_machineStatusUpdater.requestSleep( sleepCompleteNotifier )
|
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
||||||
|
self.m_machineStatusUpdater.m_bCheckSensors = False
|
||||||
def requestWakeUp( self, wakeUpCompleteNotifier = None ):
|
else:
|
||||||
self.m_machineStatusUpdater.requestWakeUp( wakeUpCompleteNotifier )
|
assert False
|
||||||
|
|
||||||
def getQueueMachineName( self ):
|
def onNewPowerStateReading(self, powerState):
|
||||||
return self.getCluster().getJobsState().getQueueMachine( self.m_name ).getName()
|
"""
|
||||||
assert( self.m_queueName != None )
|
called when a new powerstate reading arrives
|
||||||
return self.m_queueName
|
"""
|
||||||
|
if powerState != self.getPowerState():
|
||||||
def getCluster( self ):
|
if self.getPowerState() != PowerState.UNKNOWN:
|
||||||
return self.m_cluster
|
logWarning('ClusterNode::onNewPowerStateReading : ' + self.m_name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(powerState))
|
||||||
|
self.setPowerState(powerState)
|
||||||
|
|
||||||
|
def getPowerConsumptionForPowerState(self, ePowerState):
    """
    Estimates the power drawn by this machine in the given power state.

    @param ePowerState one of PowerState.ON, PowerState.OFF, PowerState.SLEEP or PowerState.UNPLUGGED
    @return the estimated power in watts (estimated current multiplied by a 220 V supply voltage)
    """
    supplyVoltage = 220.0
    # noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine
    if ePowerState == PowerState.ON:
        return 0.9 * supplyVoltage  # value when the machine is doing nothing
    if ePowerState == PowerState.OFF:
        return 0.1 * supplyVoltage
    if ePowerState == PowerState.SLEEP:
        return 0.1 * supplyVoltage
    if ePowerState == PowerState.UNPLUGGED:
        return 0.0 * supplyVoltage
    assert False
    # only reached when assertions are disabled
    return 0.0 * supplyVoltage
|
||||||
|
|
||||||
|
def updateEnergyMeasurements(self):
    """
    Adds the energy used since the last update to the energy counters of this machine.

    The elapsed time since m_lastPowerStateTime is multiplied by the estimated power of the
    current power state and added to m_energyConsumption. The difference with the estimated
    power of the ON state is added to m_energySavings. m_lastPowerStateTime is then moved to
    the current time.

    m_lastPowerStateTime must already hold a datetime when this method is called.
    """
    # the clock is sampled once, so that no time is lost between measuring the interval and
    # starting the next one
    now = datetime.now()
    timeInterval = now - self.m_lastPowerStateTime
    # timedelta.seconds alone ignores whole days and fractions of a second ; all the components
    # are combined by hand because timedelta.total_seconds() doesn't exist before python 2.7
    elapsedSeconds = timeInterval.days * 86400 + timeInterval.seconds + timeInterval.microseconds / 1000000.0
    currentPower = self.getPowerConsumptionForPowerState(self.m_powerState)
    self.m_energyConsumption += currentPower * elapsedSeconds
    self.m_energySavings += (self.getPowerConsumptionForPowerState(PowerState.ON) - currentPower) * elapsedSeconds
    self.m_lastPowerStateTime = now
|
||||||
|
|
||||||
|
def getEnergyConsumption(self):
    """
    Brings the energy counters up to date, then returns the consumption counter.

    @return the value of m_energyConsumption (in joules) after updateEnergyMeasurements() has run
    """
    self.updateEnergyMeasurements()
    energyInJoules = self.m_energyConsumption
    return energyInJoules
|
||||||
|
|
||||||
|
def getPowerConsumption(self):
    """
    @return the estimated power of this machine for its current power state, as computed by
        getPowerConsumptionForPowerState()
    """
    return self.getPowerConsumptionForPowerState(self.m_powerState)
|
||||||
|
|
||||||
|
def getEnergySavings(self):
    """
    Brings the energy counters up to date, then returns the savings counter.

    @return the value of m_energySavings after updateEnergyMeasurements() has run
    """
    self.updateEnergyMeasurements()
    savedEnergy = self.m_energySavings
    return savedEnergy
|
||||||
|
|
||||||
|
def onSleepFailedBecauseAJobJustArrived(self):
    """
    Logs (at info level) that the scheduled sleep of this machine has been canceled.
    """
    message = '%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name
    logInfo(message)
|
||||||
|
|
||||||
|
def requestSleep(self, sleepCompleteNotifier=None):
    """
    Forwards a sleep request to the status updater of this machine.

    @param sleepCompleteNotifier passed unchanged to the status updater's requestSleep() (None by default)
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestSleep(sleepCompleteNotifier)
|
||||||
|
|
||||||
|
def requestWakeUp(self, wakeUpCompleteNotifier=None):
    """
    Forwards a wake up request to the status updater of this machine.

    @param wakeUpCompleteNotifier passed unchanged to the status updater's requestWakeUp() (None by default)
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestWakeUp(wakeUpCompleteNotifier)
|
||||||
|
|
||||||
|
def getQueueMachineName(self):
    """
    Looks up the name under which this machine is known to the job queue.

    The lookup goes through the jobs state of the cluster this machine belongs to.
    The two unreachable statements that used to follow the return (they read an
    m_queueName attribute) have been removed.

    @return the result of getName() on the queue machine registered for m_name
    """
    jobsState = self.getCluster().getJobsState()
    return jobsState.getQueueMachine(self.m_name).getName()
|
||||||
|
|
||||||
|
def getCluster(self):
    """
    @return the cluster this machine belongs to (m_cluster)
    """
    owningCluster = self.m_cluster
    return owningCluster
|
||||||
|
|
|
@ -2,187 +2,191 @@ import threading
|
||||||
import time
|
import time
|
||||||
import Lib.Util
|
import Lib.Util
|
||||||
import Lib.SimpaDbUtil
|
import Lib.SimpaDbUtil
|
||||||
import os
|
from PowerState import PowerState
|
||||||
import traceback
|
from Log import logInfo, logDebug
|
||||||
import sys
|
from Util import blockingWakeUpMachine, blockingPutMachineToSleep, getPowerState, onException
|
||||||
from PowerState import *
|
|
||||||
from QstatParser import *
|
|
||||||
import Util
|
|
||||||
|
|
||||||
class IWakeUpCompleteNotifier:
|
class IWakeUpCompleteNotifier:
|
||||||
"""
|
"""
|
||||||
interface for wakeup notifiers
|
interface for wakeup notifiers
|
||||||
"""
|
"""
|
||||||
def onWakeUpComplete( self ):
|
def onWakeUpComplete(self):
|
||||||
assert( False )
|
assert False
|
||||||
|
|
||||||
|
|
||||||
class ISleepCompleteNotifier:
|
class ISleepCompleteNotifier:
|
||||||
"""
|
"""
|
||||||
interface for sleep notifiers
|
interface for sleep notifiers
|
||||||
"""
|
"""
|
||||||
def onSleepComplete( self, bSleepSucceeded ):
|
def onSleepComplete(self, bSleepSucceeded):
|
||||||
assert( False )
|
assert False
|
||||||
|
|
||||||
|
|
||||||
class IRequest:
|
class IRequest:
|
||||||
GO_TO_SLEEP = 1
|
GO_TO_SLEEP = 1
|
||||||
WAKE_UP = 2
|
WAKE_UP = 2
|
||||||
CHECK_POWER_STATE = 3
|
CHECK_POWER_STATE = 3
|
||||||
|
|
||||||
def __init__( self, requestType ):
|
|
||||||
self.m_type = requestType
|
|
||||||
|
|
||||||
def getType( self ):
|
|
||||||
return self.m_type
|
|
||||||
|
|
||||||
def process( self, clusterNodeStatusUpdater ):
|
|
||||||
"""
|
|
||||||
processes this request
|
|
||||||
"""
|
|
||||||
assert( False ) # this method is abstract
|
|
||||||
|
|
||||||
class WakeUpRequest( IRequest ):
|
def __init__(self, requestType):
|
||||||
|
self.m_type = requestType
|
||||||
|
|
||||||
def __init__( self, wakeUpNotifier ):
|
def getType(self):
|
||||||
IRequest.__init__( self, IRequest.WAKE_UP )
|
return self.m_type
|
||||||
self.m_wakeUpNotifier = wakeUpNotifier
|
|
||||||
|
|
||||||
def process( self, clusterNodeStatusUpdater ):
|
def process(self, clusterNodeStatusUpdater):
|
||||||
assert( clusterNodeStatusUpdater.m_bShouldAlwaysBeOn == False ) # are we attempting to wake up a machine that should always be on ?
|
"""
|
||||||
logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName() )
|
processes this request
|
||||||
bSuccess = blockingWakeUpMachine( clusterNodeStatusUpdater.getName() )
|
"""
|
||||||
assert( bSuccess )
|
assert False # this method is abstract
|
||||||
# activate the associated machine queue
|
|
||||||
if clusterNodeStatusUpdater.setQueueActivation( True ):
|
|
||||||
None # all is ok
|
|
||||||
else:
|
|
||||||
assert( False )
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
|
||||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState( PowerState.ON )
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
|
||||||
if self.m_wakeUpNotifier:
|
|
||||||
logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
|
|
||||||
self.m_wakeUpNotifier.onWakeUpComplete()
|
|
||||||
|
|
||||||
class SleepRequest( IRequest ):
|
|
||||||
|
|
||||||
def __init__( self, sleepCompleteNotifier ):
|
|
||||||
IRequest.__init__( self, IRequest.GO_TO_SLEEP )
|
|
||||||
self.m_sleepCompleteNotifier = sleepCompleteNotifier
|
|
||||||
|
|
||||||
def process( self, clusterNodeStatusUpdater ):
|
class WakeUpRequest(IRequest):
|
||||||
assert( clusterNodeStatusUpdater.m_bShouldAlwaysBeOn == False ) # are we attempting to put a machine the should stay on to sleep ?
|
|
||||||
logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName() )
|
|
||||||
if clusterNodeStatusUpdater.setQueueActivation( False ):
|
|
||||||
if clusterNodeStatusUpdater.queueIsEmpty():
|
|
||||||
if blockingPutMachineToSleep( clusterNodeStatusUpdater.m_clusterNodeName ):
|
|
||||||
# now we know that the machine is asleep
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
|
||||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState( PowerState.SLEEP )
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
|
||||||
if self.m_sleepCompleteNotifier:
|
|
||||||
self.m_sleepCompleteNotifier.onSleepComplete( True )
|
|
||||||
else:
|
|
||||||
assert( False )
|
|
||||||
else:
|
|
||||||
# reactivate the queue
|
|
||||||
if not clusterNodeStatusUpdater.setQueueActivation( True ):
|
|
||||||
assert( False )
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
|
||||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState( PowerState.ON ) # this is necessary to reenable the various cyclic checks that were disabled on sleep request
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
|
||||||
clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived()
|
|
||||||
if self.m_sleepCompleteNotifier:
|
|
||||||
self.m_sleepCompleteNotifier.onSleepComplete( False )
|
|
||||||
else:
|
|
||||||
assert( False )
|
|
||||||
|
|
||||||
class CheckPowerStateRequest( IRequest ):
|
def __init__(self, wakeUpNotifier):
|
||||||
|
IRequest.__init__(self, IRequest.WAKE_UP)
|
||||||
|
self.m_wakeUpNotifier = wakeUpNotifier
|
||||||
|
|
||||||
def __init__( self ):
|
def process(self, clusterNodeStatusUpdater):
|
||||||
IRequest.__init__( self, IRequest.CHECK_POWER_STATE )
|
assert clusterNodeStatusUpdater.m_bShouldAlwaysBeOn is False # are we attempting to wake up a machine that should always be on ?
|
||||||
|
logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName())
|
||||||
|
bSuccess = blockingWakeUpMachine(clusterNodeStatusUpdater.getName())
|
||||||
|
assert bSuccess
|
||||||
|
# activate the associated machine queue
|
||||||
|
if clusterNodeStatusUpdater.setQueueActivation(True):
|
||||||
|
pass # all is ok
|
||||||
|
else:
|
||||||
|
assert False
|
||||||
|
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||||
|
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)
|
||||||
|
clusterNodeStatusUpdater.m_stateLock.release()
|
||||||
|
if self.m_wakeUpNotifier:
|
||||||
|
logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
|
||||||
|
self.m_wakeUpNotifier.onWakeUpComplete()
|
||||||
|
|
||||||
def process( self, clusterNodeStatusUpdater ):
|
|
||||||
powerState = Util.getPowerState( clusterNodeStatusUpdater.m_clusterNodeName )
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
|
||||||
clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading( powerState )
|
|
||||||
clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time()
|
|
||||||
clusterNodeStatusUpdater.m_stateLock.release()
|
|
||||||
|
|
||||||
class ClusterNodeStatusUpdater( threading.Thread ):
|
class SleepRequest(IRequest):
|
||||||
DELAY_BETWEEN_POWERSTATE_CHECKS=5*60 # in seconds
|
|
||||||
|
def __init__(self, sleepCompleteNotifier):
|
||||||
def __init__( self, machineName, clusterNode, gridEngine ):
|
IRequest.__init__(self, IRequest.GO_TO_SLEEP)
|
||||||
threading.Thread.__init__(self)
|
self.m_sleepCompleteNotifier = sleepCompleteNotifier
|
||||||
self.m_clusterNodeName = machineName
|
|
||||||
self.m_clusterNode = clusterNode
|
def process(self, clusterNodeStatusUpdater):
|
||||||
self.m_gridEngine = gridEngine
|
assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn # are we attempting to put a machine the should stay on to sleep ?
|
||||||
self.m_bStop = False
|
logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName())
|
||||||
self.m_lastPowerStateCheckTime = None #time.time()
|
if clusterNodeStatusUpdater.setQueueActivation(False):
|
||||||
self.m_bCheckPowerState = True
|
if clusterNodeStatusUpdater.queueIsEmpty():
|
||||||
self.m_stateLock = threading.Lock() # lock that prevents concurrent access to the state of this instance
|
if blockingPutMachineToSleep(clusterNodeStatusUpdater.m_clusterNodeName):
|
||||||
self.m_bShouldAlwaysBeOn = False # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
|
# now we know that the machine is asleep
|
||||||
self.m_pendingRequestsQueue = []
|
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||||
|
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.SLEEP)
|
||||||
def getGridEngine( self ):
|
clusterNodeStatusUpdater.m_stateLock.release()
|
||||||
return self.m_gridEngine
|
if self.m_sleepCompleteNotifier:
|
||||||
|
self.m_sleepCompleteNotifier.onSleepComplete(True)
|
||||||
def getName( self ):
|
else:
|
||||||
return self.m_clusterNodeName
|
assert False
|
||||||
|
else:
|
||||||
def setShouldAlwaysBeOn( self ):
|
# reactivate the queue
|
||||||
print('%s should always be on' % (self.getName()) )
|
if not clusterNodeStatusUpdater.setQueueActivation(True):
|
||||||
self.m_bShouldAlwaysBeOn = True
|
assert False
|
||||||
|
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||||
def pushRequest( self, request ):
|
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON) # this is necessary to reenable the various cyclic checks that were disabled on sleep request
|
||||||
self.m_stateLock.acquire()
|
clusterNodeStatusUpdater.m_stateLock.release()
|
||||||
self.m_pendingRequestsQueue.append(request)
|
clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived()
|
||||||
self.m_stateLock.release()
|
if self.m_sleepCompleteNotifier:
|
||||||
|
self.m_sleepCompleteNotifier.onSleepComplete(False)
|
||||||
def popRequest( self ):
|
else:
|
||||||
oldestRequest = None
|
assert False
|
||||||
self.m_stateLock.acquire()
|
|
||||||
if len(self.m_pendingRequestsQueue) != 0:
|
|
||||||
oldestRequest = self.m_pendingRequestsQueue.pop(0)
|
class CheckPowerStateRequest(IRequest):
|
||||||
self.m_stateLock.release()
|
|
||||||
return oldestRequest
|
def __init__(self):
|
||||||
|
IRequest.__init__(self, IRequest.CHECK_POWER_STATE)
|
||||||
def run( self ):
|
|
||||||
try:
|
def process(self, clusterNodeStatusUpdater):
|
||||||
|
powerState = getPowerState(clusterNodeStatusUpdater.m_clusterNodeName)
|
||||||
while not self.m_bStop :
|
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||||
# handle the oldest request
|
clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading(powerState)
|
||||||
request = self.popRequest()
|
clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time()
|
||||||
if request != None :
|
clusterNodeStatusUpdater.m_stateLock.release()
|
||||||
request.process( self )
|
|
||||||
|
|
||||||
# schedule a power state check if required
|
class ClusterNodeStatusUpdater(threading.Thread):
|
||||||
currentTime = time.time()
|
DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60 # in seconds
|
||||||
if self.m_bCheckPowerState:
|
|
||||||
if not self.m_bShouldAlwaysBeOn: # don't do power checks on such machines because some current implementations of
|
def __init__(self, machineName, clusterNode, gridEngine):
|
||||||
# operations involved might cause the machine to go to sleep
|
threading.Thread.__init__(self)
|
||||||
if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
|
self.m_clusterNodeName = machineName
|
||||||
self.pushRequest( CheckPowerStateRequest() )
|
self.m_clusterNode = clusterNode
|
||||||
|
self.m_gridEngine = gridEngine
|
||||||
time.sleep(1)
|
self.m_bStop = False
|
||||||
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
self.m_lastPowerStateCheckTime = None # time.time()
|
||||||
Util.onException(exception)
|
self.m_bCheckPowerState = True
|
||||||
|
self.m_stateLock = threading.Lock() # lock that prevents concurrent access to the state of this instance
|
||||||
def requestSleep( self, sleepCompleteNotifier = None ):
|
self.m_bShouldAlwaysBeOn = False # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
|
||||||
assert( self.m_bShouldAlwaysBeOn == False )
|
self.m_pendingRequestsQueue = []
|
||||||
self.pushRequest( SleepRequest( sleepCompleteNotifier ) )
|
|
||||||
|
def getGridEngine(self):
|
||||||
def requestWakeUp( self, wakeUpNotifier = None ):
|
return self.m_gridEngine
|
||||||
assert( self.m_bShouldAlwaysBeOn == False )
|
|
||||||
self.pushRequest( WakeUpRequest( wakeUpNotifier ) )
|
def getName(self):
|
||||||
|
return self.m_clusterNodeName
|
||||||
def getQueueMachineName( self ):
|
|
||||||
return self.m_clusterNode.getQueueMachineName()
|
def setShouldAlwaysBeOn(self):
|
||||||
|
print('%s should always be on' % (self.getName()))
|
||||||
def setQueueActivation( self, bEnable ):
|
self.m_bShouldAlwaysBeOn = True
|
||||||
"""
|
|
||||||
@return true on success, false otherwise
|
def pushRequest(self, request):
|
||||||
"""
|
self.m_stateLock.acquire()
|
||||||
return self.getGridEngine().setQueueInstanceActivation( self.getQueueMachineName(), bEnable )
|
self.m_pendingRequestsQueue.append(request)
|
||||||
|
self.m_stateLock.release()
|
||||||
def queueIsEmpty( self ):
|
|
||||||
return self.getGridEngine().queueIsEmpty( self.getName() )
|
def popRequest(self):
    """
    Removes the oldest pending request from the queue and returns it.

    The queue is accessed while holding m_stateLock.

    @return the request at the front of m_pendingRequestsQueue, or None if the queue is empty
    """
    # the with statement releases the lock even if an exception is raised
    with self.m_stateLock:
        if self.m_pendingRequestsQueue:
            return self.m_pendingRequestsQueue.pop(0)
        return None
|
||||||
|
|
||||||
|
def run(self):
    """
    Main loop of the thread ; it keeps running until m_bStop is set.

    On each iteration the oldest pending request (if any) is processed, a power state check is
    queued when one is due, and the thread then sleeps for one second. Any exception, including
    KeyboardInterrupt, ends the loop and is handed to onException().
    """
    try:
        while not self.m_bStop:
            # handle the oldest request
            pendingRequest = self.popRequest()
            if pendingRequest is not None:
                pendingRequest.process(self)

            # schedule a power state check if required
            now = time.time()
            # power checks are not done on machines that should always be on, because some current
            # implementations of operations involved might cause the machine to go to sleep
            if self.m_bCheckPowerState and not self.m_bShouldAlwaysBeOn:
                lastCheckTime = self.m_lastPowerStateCheckTime
                if (not lastCheckTime) or (now > (lastCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
                    self.pushRequest(CheckPowerStateRequest())

            time.sleep(1)
    except BaseException as exception:  # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
        onException(exception)
|
||||||
|
|
||||||
|
def requestSleep(self, sleepCompleteNotifier=None):
    """
    Queues a request to put the machine to sleep.

    The machine must not be flagged as always on (this is checked with an assertion).

    @param sleepCompleteNotifier given to the SleepRequest that is queued (None by default)
    """
    assert not self.m_bShouldAlwaysBeOn
    sleepRequest = SleepRequest(sleepCompleteNotifier)
    self.pushRequest(sleepRequest)
|
||||||
|
|
||||||
|
def requestWakeUp(self, wakeUpNotifier=None):
    """
    Queues a request to wake the machine up.

    m_bShouldAlwaysBeOn must be False (this is checked with an assertion).

    @param wakeUpNotifier given to the WakeUpRequest that is queued (None by default)
    """
    assert self.m_bShouldAlwaysBeOn is False
    wakeUpRequest = WakeUpRequest(wakeUpNotifier)
    self.pushRequest(wakeUpRequest)
|
||||||
|
|
||||||
|
def getQueueMachineName(self):
    """
    @return the queue machine name, as given by the cluster node this updater is attached to
    """
    clusterNode = self.m_clusterNode
    return clusterNode.getQueueMachineName()
|
||||||
|
|
||||||
|
def setQueueActivation(self, bEnable):
    """
    Asks the grid engine to change the activation of this machine's queue instance.

    @param bEnable passed unchanged to the grid engine's setQueueInstanceActivation()
    @return true on success, false otherwise
    """
    gridEngine = self.getGridEngine()
    queueMachineName = self.getQueueMachineName()
    return gridEngine.setQueueInstanceActivation(queueMachineName, bEnable)
|
||||||
|
|
||||||
|
def queueIsEmpty(self):
    """
    @return the result of the grid engine's queueIsEmpty() for this machine's name
    """
    gridEngine = self.getGridEngine()
    return gridEngine.queueIsEmpty(self.getName())
|
||||||
|
|
|
@ -1,209 +1,209 @@
|
||||||
import threading
|
import threading
|
||||||
from JobsStateUpdater import *
|
from JobsStateUpdater import JobsStateUpdater
|
||||||
import Lib.Util
|
import Lib.Util
|
||||||
import Lib.SimpaDbUtil
|
import Lib.SimpaDbUtil
|
||||||
from ClusterNode import *
|
from ClusterNode import ClusterNode
|
||||||
|
from Log import logInfo, logError
|
||||||
|
from PowerState import PowerState
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
||||||
class ClusterStatus:
    """
    The current state (jobs, sensors) of the cluster

    @param gridEngine the interface to the batch job tool (in our case it's sun grid engine)
    """

    def __init__(self, gridEngine):
        self.m_gridEngine = gridEngine
        self.m_clusterNodes = {}  # the ClusterNode instances under control, indexed by machine name
        self.m_lock = threading.Lock()  # to prevent concurrent access to this instance
        self.m_jobsStateUpdater = JobsStateUpdater(self)
        self.m_jobsState = None  # the latest jobs state received through onNewJobsState (None until the first one arrives)
        # self.m_controlledMachineNames = ['simpatix30']
        self.m_controlledMachineNames = []  # ['simpatix30']
        if False:  # deliberately disabled : kept as a record of the machines that used to be controlled
            for iMachine in range(11, 40):
                if (iMachine == 31) or (iMachine == 32):
                    continue  # these machines don't seem to be able to go to sleep properly (bug 00000010)
                if (iMachine == 18):
                    continue  # this machine needs maintenance (restarting because it's very slow for an unknown reason)
                self.m_controlledMachineNames.append('simpatix%d' % iMachine)
        nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames()
        for nodeName in nodeNames:
            if nodeName in self.m_controlledMachineNames:
                logInfo('machine %s is under the cluster controller\'s control' % nodeName)
                clusterNode = ClusterNode(nodeName, self, gridEngine)
                if nodeName == 'simpatix10':
                    clusterNode.setShouldAlwaysBeOn()
                self.m_clusterNodes[nodeName] = clusterNode

    def setControlOnMachine(self, machineName, bControl):
        """
        puts a machine under the control of ClusterController, or removes it from its control

        @param machineName the name of the machine
        @param bControl if true, the machine is added (nothing happens if it's already controlled); otherwise its status updater is stopped and the machine is removed
        """
        if bControl:
            # add machineName under control of ClusterController
            for clusterNode in self.m_clusterNodes.values():
                if clusterNode.getName() == machineName:
                    return  # nothing to do : machineName is already under the control of ClusterController

            clusterNode = ClusterNode(machineName, self, self.m_gridEngine)
            if machineName == 'simpatix10':
                clusterNode.setShouldAlwaysBeOn()
            self.m_clusterNodes[machineName] = clusterNode
            clusterNode.m_machineStatusUpdater.start()
        else:
            # remove machineName from control of ClusterController
            clusterNode = self.m_clusterNodes.get(machineName)
            if clusterNode:
                clusterNode.m_machineStatusUpdater.m_bStop = True
                clusterNode.m_machineStatusUpdater.join()
                self.m_clusterNodes.pop(machineName)

    def getGridEngine(self):
        return self.m_gridEngine

    def getMachines(self):
        """
        @return the controlled cluster nodes, as a dictionary indexed by machine name (the internal dictionary itself, not a copy)
        """
        return self.m_clusterNodes

    def startReadingThreads(self):
        """
        starts the status updater of each controlled node, then the jobs state updater
        """
        for clusterNode in self.m_clusterNodes.values():
            clusterNode.m_machineStatusUpdater.start()
        self.m_jobsStateUpdater.start()

    def stopReadingThreads(self):
        """
        asks each updater to stop and waits for it to finish (node updaters first, one after the other, then the jobs state updater)
        """
        for clusterNode in self.m_clusterNodes.values():
            clusterNode.m_machineStatusUpdater.m_bStop = True
            clusterNode.m_machineStatusUpdater.join()
        self.m_jobsStateUpdater.m_bStop = True
        self.m_jobsStateUpdater.join()

    def onNewJobsState(self, newJobsState):
        """
        replaces the current jobs state with the given one (under the protection of the lock)
        """
        # logDebug('ClusterStatus::onNewJobsState : attempting to acquire lock to access m_jobsState')
        with self.m_lock:
            # logDebug('ClusterStatus::onNewJobsState : got lock to access m_jobsState')
            self.m_jobsState = newJobsState

    def getJobsOnMachine(self, machineName):
        return self.m_jobsState.getJobsOnMachine(machineName)

    def isReady(self):
        """
        @return True if every controlled node is ready and a jobs state has been received, False otherwise (the reason is logged)
        """
        for clusterNode in self.m_clusterNodes.values():
            if not clusterNode.isReady():
                logInfo('ClusterStatus::isReady : not ready because of ' + clusterNode.getName())
                return False
        if self.m_jobsState is None:
            logInfo('ClusterStatus::isReady : not ready because waiting for jobs state')
            return False
        return True

    def getIdleMachines(self):
        """
        returns the controlled machines that are powered on and on which no job is found, as a dictionary indexed by machine name

        @raise AssertionError if the cluster status is not ready, or if the jobs state is older than one hour (bug 00000009)
        """
        # the original code asserted the bound method itself (always true) : the method has to be called for the check to mean anything
        assert self.isReady()
        bBUG_00000009_IS_STILL_ALIVE = True
        if bBUG_00000009_IS_STILL_ALIVE:
            currentTime = time.time()
            fJOBS_STATE_MAX_ALLOWED_AGE = 3600  # in seconds
            fJobsStateAge = currentTime - self.m_jobsState.getTime()
            if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
                strError = 'ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge)
                logError(strError)
                # raised explicitly so that a stale jobs state is never used silently when python runs with -O
                raise AssertionError(strError)
        idleMachines = {}
        for machineName, machine in self.m_clusterNodes.items():
            if machine.getPowerState() == PowerState.ON:
                jobsOnThisMachine = self.getJobsOnMachine(machineName)
                if len(jobsOnThisMachine) == 0:
                    idleMachines[machineName] = machine
        return idleMachines

    def getPendingJobs(self):
        return self.m_jobsState.getPendingJobs()

    def getJobsState(self):
        return self.m_jobsState

    def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements):
        """
        @return False if the job is restricted to a set of queues and the queue of queueMachine is not one of them, True otherwise
        """
        if jobRequirements.m_queues:
            bQueueIsInAllowedQueues = False
            for queueName in jobRequirements.m_queues:
                if queueName == queueMachine.getQueueName():
                    bQueueIsInAllowedQueues = True
            if not bQueueIsInAllowedQueues:
                logInfo('queueMachineFitsJobRequirements : queueMachine ' + queueMachine.getName() + ' rejected because it\'s not in the allowed queues')
                return False
        return True

    def getEnergyConsumption(self):
        """
        returns an estimate of the energy consumption since the start of the cluster controller (in joules)
        """
        fEnergyConsumption = 0.0
        for machine in self.m_clusterNodes.values():
            if machine.isReady():  # there are cases where the machine is not ready yet (for example, it's just been added to clustercontroller's control)
                fEnergyConsumption += machine.getEnergyConsumption()
        return fEnergyConsumption

    def getEnergySavings(self):
        """
        returns an estimate of the energy saving since the start of the cluster controller (in joules)
        """
        fEnergySavings = 0.0
        for machine in self.m_clusterNodes.values():
            if machine.isReady():
                fEnergySavings += machine.getEnergySavings()
        return fEnergySavings

    def getCurrentPowerConsumption(self):
        """
        returns the sum of the power consumptions reported by the controlled machines that are ready
        """
        fPowerConsumption = 0.0
        for machine in self.m_clusterNodes.values():
            if machine.isReady():
                fPowerConsumption += machine.getPowerConsumption()
        return fPowerConsumption

    def getCurrentPowerSavings(self):
        """
        returns, summed over the controlled machines that are ready, the difference between the power consumption in the ON power state and the current power consumption
        """
        fPowerSavings = 0.0
        for machine in self.m_clusterNodes.values():
            if machine.isReady():
                fPowerSavings += machine.getPowerConsumptionForPowerState(PowerState.ON) - machine.getPowerConsumption()
        return fPowerSavings

    def getNumControlledSlots(self):
        """
        returns the total number of slots of the queue machines associated with the controlled machines
        """
        # 'with' guarantees that the lock is released even if the jobs state doesn't know one of the machines
        with self.m_lock:
            iNumControlledSlots = 0
            for machine in self.m_clusterNodes.values():
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumControlledSlots += queueMachine.getNumSlots()
        return iNumControlledSlots

    def getNumUsedSlots(self):
        """
        returns the number of slots that are not free on the controlled machines
        """
        with self.m_lock:
            iNumUsedSlots = 0
            for machine in self.m_clusterNodes.values():
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
                assert iNumUsedSlotsOnThisMachine >= 0
                iNumUsedSlots += iNumUsedSlotsOnThisMachine
        return iNumUsedSlots

    def getNumWastedSlots(self):
        """
        returns the number of free slots on the controlled machines that are in the ON power state
        """
        with self.m_lock:
            iNumWastedSlots = 0
            for machine in self.m_clusterNodes.values():
                if machine.getPowerState() == PowerState.ON:
                    queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                    iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
        return iNumWastedSlots

    def getNumSleepingSlots(self):
        """
        returns the number of free slots on the controlled machines that are in the SLEEP power state
        """
        with self.m_lock:
            iNumSleepingSlots = 0
            for machine in self.m_clusterNodes.values():
                if machine.getPowerState() == PowerState.SLEEP:
                    queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                    iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
        return iNumSleepingSlots
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
"""
|
"""
|
||||||
script that installs ClusterController on simpatix10
|
script that installs ClusterController on simpatix10
|
||||||
to start ClusterController :
|
to start ClusterController :
|
||||||
launchctl start fr.univ-rennes1.ipr.ClusterController
|
launchctl start fr.univ-rennes1.ipr.ClusterController
|
||||||
"""
|
"""
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0, '..')
|
sys.path.insert(0, '..')
|
||||||
|
@ -11,32 +11,32 @@ from Lib.Util import *
|
||||||
import os
|
import os
|
||||||
|
|
||||||
if __name__ == '__main__':
    # builds one compound shell command, runs it as root on the target machine through ssh, then reports the outcome
    machineName = 'simpatix10'
    strThisDir = os.getcwd()
    strPythonDevDir = strThisDir + '/..'
    print('installing ClusterController on ' + machineName)
    remoteCommand = ''
    remoteCommand += 'mkdir -p /usr/local/bin/ipr/Python;'
    remoteCommand += 'rm -r /usr/local/bin/ipr/Python/Lib;'
    remoteCommand += 'rm -r /usr/local/bin/ipr/Python/ClusterController;'
    remoteCommand += 'cp -r %s/Lib /usr/local/bin/ipr/Python/;' % strPythonDevDir
    remoteCommand += 'cp -r %s/ClusterController /usr/local/bin/ipr/Python/;' % strPythonDevDir
    remoteCommand += 'cp %s/ClusterController/ClusterController.plist /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;' % strPythonDevDir
    remoteCommand += 'cp -r %s/ClusterController/ClusterControllerLauncher.sh /usr/local/bin/ipr/Python/ClusterController/;' % strPythonDevDir
    # unload then load, so that launchd takes the freshly copied plist into account
    remoteCommand += 'launchctl unload /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
    remoteCommand += 'launchctl load /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist;'
    command = 'ssh root@' + machineName + ' "' + remoteCommand + '"'
    (returnCode, stdout, stderr) = executeCommand(command)
    for strSingleCommand in remoteCommand.split(';'):
        print(strSingleCommand)
    print(stdout)
    print(stderr)
    if returnCode == 0:
        print('install succeeded on ' + machineName)
    else:
        print('install failed on ' + machineName + ' (see below for detail)')
        # was a python 2 'print stderr' statement : the call form behaves the same and also parses under python 3
        print(stderr)
        # assert( False )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,130 +1,146 @@
|
||||||
|
|
||||||
class JobStateFlags:
    """
    the flags describing the state of a job : each flag is a distinct bit, so that flags can be tested with a bitwise and
    """
    RUNNING = 1  # the job is running
    WAITING = 2  # the job is waiting
    QUEUED = 4  # not sure what that exactly means but it reflects the q state of jobs as seen in the pending jobs list from qstat -f -u \*
    TRANSFERING = 8
    DELETED = 16
    HOLD = 32
    ERROR = 64
    SUSPENDED = 128
|
||||||
|
|
||||||
|
|
||||||
class ParallelEnvironment:
    """
    the identifiers of the known parallel environments
    """
    MPI = 1
|
||||||
|
|
||||||
|
|
||||||
class JobRequirements:
    """
    what a job requires to run ; every requirement is None as long as it has not been set
    """
    def __init__(self):
        self.m_numSlots = None
        self.m_strArchitecture = None  # machine architecture
        self.m_parallelEnvironment = None
        self.m_queues = None  # the list of queues this job is allowed to run on
|
||||||
|
|
||||||
|
|
||||||
class JobId:
    """
    the identifier of a job.
    We treat each element of a job array as a separate job
    A single integer is no longer enough to identify a job because all elements in a job array
    share the same sge job identifier. To uniquely define a job array element, we also use the task id.
    """
    MAX_NUM_JOBS_IN_ARRAY = 1000000

    def __init__(self, iJobId, iJobArrayElementId=None):
        """
        @param iJobId the sge job identifier
        @param iJobArrayElementId the task id of the job array element, or None if this identifier does not refer to a job array element
        """
        if iJobArrayElementId is not None:
            assert iJobArrayElementId <= self.MAX_NUM_JOBS_IN_ARRAY
        self.m_iJobId = iJobId
        self.m_iJobArrayElementId = iJobArrayElementId  # None if this identifier does not refer to a job array element

    def __hash__(self):
        """
        required to use a JobId as a dict hash key
        """
        iHash = self.m_iJobId * self.MAX_NUM_JOBS_IN_ARRAY
        if self.m_iJobArrayElementId is not None:
            iHash += self.m_iJobArrayElementId
        return iHash

    def __eq__(self, other):
        """
        required to use a JobId as a dict hash key
        """
        if not isinstance(other, JobId):
            # lets python fall back on its default comparison instead of failing on the missing attributes
            return NotImplemented
        if self.m_iJobId != other.m_iJobId:
            return False
        if self.m_iJobArrayElementId != other.m_iJobArrayElementId:
            return False
        return True

    def __ne__(self, other):
        """
        python 2 doesn't derive != from __eq__, so it's defined explicitly to keep both operators consistent
        """
        bEqual = self.__eq__(other)
        if bEqual is NotImplemented:
            return NotImplemented
        return not bEqual

    def isJobArrayElement(self):
        return (self.m_iJobArrayElementId is not None)

    def getMainId(self):
        """
        @return the sge job identifier (shared by all the elements of a job array)
        """
        return self.m_iJobId

    def asStr(self):
        """
        @return the job identifier as a string, followed by '.' and the task id for a job array element
        """
        strResult = '%s' % self.m_iJobId
        if self.isJobArrayElement():
            strResult += '.%d' % self.m_iJobArrayElementId
        return strResult
|
||||||
|
|
||||||
|
|
||||||
class Job:
    """
    the description of a single job : its identifier, owner, script, times, state flags, requirements and the slots it uses
    """
    def __init__(self, jobId):
        self.m_jobId = jobId
        self.m_startTime = None
        self.m_submitTime = None
        self.m_owner = None
        self.m_scriptName = None
        self.m_slots = {}  # the number of slots used by this job, indexed by queue machine name
        self.m_stateFlags = 0
        self.m_jobRequirements = JobRequirements()
        self.m_requestedRamPerCore = 0

    def getId(self):
        return self.m_jobId

    def setState(self, state):
        self.m_stateFlags = state

    def setOwner(self, jobOwner):
        if self.m_owner:
            # once set, the owner is not expected to change
            assert self.m_owner == jobOwner
        self.m_owner = jobOwner

    def getOwner(self):
        return self.m_owner

    def setStartTime(self, jobStartTime):
        if self.m_startTime:
            # once set, the start time is not expected to change
            assert self.m_startTime == jobStartTime
        self.m_startTime = jobStartTime

    def setSubmitTime(self, jobSubmitTime):
        if self.m_submitTime:
            # once set, the submit time is not expected to change
            assert self.m_submitTime == jobSubmitTime
        self.m_submitTime = jobSubmitTime

    def getStartTime(self):
        return self.m_startTime

    def setScriptName(self, jobScriptName):
        if self.m_scriptName:
            # once set, the script name is not expected to change
            assert self.m_scriptName == jobScriptName
        self.m_scriptName = jobScriptName

    def addSlots(self, queueMachineName, numSlots):
        """
        records that this job uses numSlots slots on the given queue machine

        @param queueMachineName the name of the queue machine
        @param numSlots the number of slots used on this queue machine
        """
        assert self.m_slots.get(queueMachineName) is None
        if self.m_slots.get(queueMachineName) is None:
            self.m_slots[queueMachineName] = numSlots
        else:
            # should never happen
            self.m_slots[queueMachineName] += numSlots

    def getSlots(self):
        """
        @return the number of slots used by this job, as a dictionary indexed by queue machine name (the internal dictionary itself, not a copy)
        """
        return self.m_slots

    def setNumRequiredSlots(self, numSlots):
        self.m_jobRequirements.m_numSlots = numSlots

    def isPending(self):
        """
        returns true if this job is waiting in the queue for whatever reason
        """
        # converted to a real boolean : the bitwise and alone yields the raw flag value (0 or 4)
        return (self.m_stateFlags & JobStateFlags.QUEUED) != 0

    def getRequestedRamPerCore(self):
        """
        requested RAM per core in bytes
        """
        return self.m_requestedRamPerCore

    def setRequestedRamPerCore(self, requestedRam):
        """
        requestedRam : requested RAM per core in bytes
        """
        self.m_requestedRamPerCore = requestedRam
|
||||||
|
|
|
@ -1,85 +1,86 @@
|
||||||
from .Log import *
|
from .Log import *
|
||||||
|
|
||||||
|
|
||||||
class JobsState:
|
class JobsState:
|
||||||
"""
|
"""
|
||||||
represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*"
|
represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*"
|
||||||
"""
|
"""
|
||||||
def __init__( self ):
|
def __init__(self):
|
||||||
self.m_jobs = {} # list of jobs
|
self.m_jobs = {} # list of jobs
|
||||||
self.m_jobArrayJobs = {} # a dictionary of jobs for each job array, indexed by job array id
|
self.m_jobArrayJobs = {} # a dictionary of jobs for each job array, indexed by job array id
|
||||||
self.m_queueMachines = {} # list of queue machines such as allintel.q@simpatix10
|
self.m_queueMachines = {} # list of queue machines such as allintel.q@simpatix10
|
||||||
self.m_stateTime = None # the time at which the state was snapshot
|
self.m_stateTime = None # the time at which the state was snapshot
|
||||||
|
|
||||||
def deleteAllJobs( self ):
|
def deleteAllJobs(self):
|
||||||
self.m_jobs = {}
|
self.m_jobs = {}
|
||||||
self.m_jobArrayJobs = {}
|
self.m_jobArrayJobs = {}
|
||||||
|
|
||||||
def addJob( self, job ):
|
|
||||||
jobId = job.getId()
|
|
||||||
self.m_jobs[ jobId ] = job
|
|
||||||
if jobId.isJobArrayElement():
|
|
||||||
tasks = self.m_jobArrayJobs.get(jobId.m_iJobId)
|
|
||||||
if tasks == None:
|
|
||||||
tasks = {}
|
|
||||||
self.m_jobArrayJobs[ jobId.m_iJobId ] = tasks
|
|
||||||
tasks[jobId] = job
|
|
||||||
|
|
||||||
def getJob( self, jobId ):
|
|
||||||
return self.m_jobs.get( jobId )
|
|
||||||
|
|
||||||
def getJobArrayJobs( self, iJobArrayId ):
|
def addJob(self, job):
|
||||||
return self.m_jobArrayJobs.get( iJobArrayId )
|
jobId = job.getId()
|
||||||
|
self.m_jobs[jobId] = job
|
||||||
def setTime( self, stateTime ):
|
if jobId.isJobArrayElement():
|
||||||
self.m_stateTime = stateTime
|
tasks = self.m_jobArrayJobs.get(jobId.m_iJobId)
|
||||||
|
if tasks is None:
|
||||||
|
tasks = {}
|
||||||
|
self.m_jobArrayJobs[jobId.m_iJobId] = tasks
|
||||||
|
tasks[jobId] = job
|
||||||
|
|
||||||
def getTime( self ):
|
def getJob(self, jobId):
|
||||||
return self.m_stateTime
|
return self.m_jobs.get(jobId)
|
||||||
|
|
||||||
def getJobsOnMachine( self, machineName ):
|
def getJobArrayJobs(self, iJobArrayId):
|
||||||
jobsOnMachine = {}
|
return self.m_jobArrayJobs.get(iJobArrayId)
|
||||||
for jobId, job in self.m_jobs.items():
|
|
||||||
for queueMachineName, numSlots in job.getSlots().items():
|
def setTime(self, stateTime):
|
||||||
jobMachineName = queueMachineName.split('@')[1]
|
self.m_stateTime = stateTime
|
||||||
if jobMachineName == machineName:
|
|
||||||
jobsOnMachine[ jobId ] = job
|
def getTime(self):
|
||||||
return jobsOnMachine
|
return self.m_stateTime
|
||||||
|
|
||||||
def getNumFreeSlotsOnQueueMachine( self, queueMachine ):
|
def getJobsOnMachine(self, machineName):
|
||||||
#logInfo('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.getName() )
|
jobsOnMachine = {}
|
||||||
numUsedSlots = 0
|
for jobId, job in self.m_jobs.items():
|
||||||
for job in self.m_jobs.values():
|
for queueMachineName, numSlots in job.getSlots().items():
|
||||||
numUsedSlotsByThisJob = job.getSlots().get( queueMachine.getName() )
|
jobMachineName = queueMachineName.split('@')[1]
|
||||||
if numUsedSlotsByThisJob != None:
|
if jobMachineName == machineName:
|
||||||
#logInfo('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob) )
|
jobsOnMachine[jobId] = job
|
||||||
numUsedSlots += numUsedSlotsByThisJob
|
return jobsOnMachine
|
||||||
else:
|
|
||||||
None
|
def getNumFreeSlotsOnQueueMachine(self, queueMachine):
|
||||||
#logInfo('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr() )
|
# logInfo('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.getName())
|
||||||
numFreeSlots = queueMachine.getNumSlots() - numUsedSlots
|
numUsedSlots = 0
|
||||||
assert( numFreeSlots >= 0 )
|
for job in self.m_jobs.values():
|
||||||
return numFreeSlots
|
numUsedSlotsByThisJob = job.getSlots().get(queueMachine.getName())
|
||||||
|
if numUsedSlotsByThisJob is not None:
|
||||||
def addQueueMachine( self, queueMachine ):
|
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob))
|
||||||
self.m_queueMachines[ queueMachine.getName() ] = queueMachine
|
numUsedSlots += numUsedSlotsByThisJob
|
||||||
|
else:
|
||||||
def getQueueMachine( self, machineName ):
|
None
|
||||||
"""
|
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr())
|
||||||
finds the queue machine associated with a machine
|
numFreeSlots = queueMachine.getNumSlots() - numUsedSlots
|
||||||
"""
|
assert numFreeSlots >= 0
|
||||||
queueMachine = None
|
return numFreeSlots
|
||||||
for qmName, qm in self.m_queueMachines.items():
|
|
||||||
if qm.m_machineName == machineName:
|
def addQueueMachine(self, queueMachine):
|
||||||
assert( queueMachine == None ) # to be sure that no more than one queue machine is on a given machine
|
self.m_queueMachines[queueMachine.getName()] = queueMachine
|
||||||
queueMachine = qm
|
|
||||||
return queueMachine
|
def getQueueMachine(self, machineName):
|
||||||
|
"""
|
||||||
def getQueueMachines( self ):
|
finds the queue machine associated with a machine
|
||||||
return self.m_queueMachines
|
"""
|
||||||
|
queueMachine = None
|
||||||
def getPendingJobs( self ):
|
for qmName, qm in self.m_queueMachines.items():
|
||||||
pendingJobs = {}
|
if qm.m_machineName == machineName:
|
||||||
for jobId, job in self.m_jobs.items():
|
assert queueMachine is None # to be sure that no more than one queue machine is on a given machine
|
||||||
if job.isPending():
|
queueMachine = qm
|
||||||
pendingJobs[ job.getId() ] = job
|
return queueMachine
|
||||||
return pendingJobs
|
|
||||||
|
def getQueueMachines(self):
|
||||||
|
return self.m_queueMachines
|
||||||
|
|
||||||
|
def getPendingJobs(self):
|
||||||
|
pendingJobs = {}
|
||||||
|
for jobId, job in self.m_jobs.items():
|
||||||
|
if job.isPending():
|
||||||
|
pendingJobs[job.getId()] = job
|
||||||
|
return pendingJobs
|
||||||
|
|
|
@ -6,30 +6,30 @@ import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
class JobsStateUpdater( threading.Thread ):
|
class JobsStateUpdater( threading.Thread ):
|
||||||
DELAY_BETWEEN_STATUS_CHECKS=10 # in seconds
|
DELAY_BETWEEN_STATUS_CHECKS=10 # in seconds
|
||||||
def __init__( self, clusterStatus ):
|
def __init__( self, clusterStatus ):
|
||||||
threading.Thread.__init__(self)
|
threading.Thread.__init__(self)
|
||||||
self.m_clusterStatus = clusterStatus
|
self.m_clusterStatus = clusterStatus
|
||||||
self.m_bStop = False
|
self.m_bStop = False
|
||||||
|
|
||||||
def getName( self ):
|
def getName( self ):
|
||||||
return 'JobsStateUpdater'
|
return 'JobsStateUpdater'
|
||||||
|
|
||||||
def getGridEngine( self ):
|
def getGridEngine( self ):
|
||||||
return self.m_clusterStatus.getGridEngine()
|
return self.m_clusterStatus.getGridEngine()
|
||||||
|
|
||||||
def updateClusterStatus( self ):
|
def updateClusterStatus( self ):
|
||||||
#log('JobsStateUpdater::updateClusterStatus : start')
|
#log('JobsStateUpdater::updateClusterStatus : start')
|
||||||
|
|
||||||
jobsState = self.getGridEngine().getCurrentJobsState()
|
jobsState = self.getGridEngine().getCurrentJobsState()
|
||||||
# update the jobs in the cluster status
|
# update the jobs in the cluster status
|
||||||
self.m_clusterStatus.onNewJobsState( jobsState )
|
self.m_clusterStatus.onNewJobsState( jobsState )
|
||||||
#log('JobsStateUpdater::updateClusterStatus : end')
|
#log('JobsStateUpdater::updateClusterStatus : end')
|
||||||
|
|
||||||
def run( self ):
|
def run( self ):
|
||||||
try:
|
try:
|
||||||
while not self.m_bStop :
|
while not self.m_bStop :
|
||||||
self.updateClusterStatus()
|
self.updateClusterStatus()
|
||||||
time.sleep(JobsStateUpdater.DELAY_BETWEEN_STATUS_CHECKS)
|
time.sleep(JobsStateUpdater.DELAY_BETWEEN_STATUS_CHECKS)
|
||||||
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||||
Util.onException(exception)
|
Util.onException(exception)
|
||||||
|
|
|
@ -1,29 +1,33 @@
|
||||||
import time
|
import time
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
gLogFilePath = '/tmp/ClusterController.log'#'/var/log/ClusterController.log'
|
gLogFilePath = '/tmp/ClusterController.log' # '/var/log/ClusterController.log'
|
||||||
|
|
||||||
def log( message ):
|
|
||||||
threadName = threading.currentThread().getName()
|
|
||||||
logMessage = time.asctime(time.localtime())+' : '+ threadName + ' : ' + message
|
|
||||||
print(logMessage)
|
|
||||||
f = open(gLogFilePath, 'a+')
|
|
||||||
assert( f )
|
|
||||||
try:
|
|
||||||
f.write( logMessage + '\n' )
|
|
||||||
finally:
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
def logDebug( message ):
|
def log(message):
|
||||||
log('[D]'+message)
|
threadName = threading.currentThread().getName()
|
||||||
return
|
logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message
|
||||||
|
print(logMessage)
|
||||||
def logInfo( message ):
|
f = open(gLogFilePath, 'a+')
|
||||||
log('[I]'+message)
|
assert f
|
||||||
|
try:
|
||||||
|
f.write(logMessage + '\n')
|
||||||
|
finally:
|
||||||
|
f.close()
|
||||||
|
|
||||||
def logWarning( message ):
|
|
||||||
log('[W]'+message)
|
|
||||||
|
|
||||||
def logError( message ):
|
def logDebug(message):
|
||||||
log('[E]'+message)
|
log('[D]' + message)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def logInfo(message):
|
||||||
|
log('[I]' + message)
|
||||||
|
|
||||||
|
|
||||||
|
def logWarning(message):
|
||||||
|
log('[W]' + message)
|
||||||
|
|
||||||
|
|
||||||
|
def logError(message):
|
||||||
|
log('[E]' + message)
|
||||||
|
|
|
@ -1,21 +1,22 @@
|
||||||
|
|
||||||
class PowerState:
|
class PowerState:
|
||||||
UNKNOWN=0
|
UNKNOWN = 0
|
||||||
OFF=1
|
OFF = 1
|
||||||
ON=2
|
ON = 2
|
||||||
SLEEP=3
|
SLEEP = 3
|
||||||
UNPLUGGED=4
|
UNPLUGGED = 4
|
||||||
|
|
||||||
def PowerStateToStr( powerState ):
|
|
||||||
if powerState == PowerState.UNKNOWN:
|
def PowerStateToStr(powerState):
|
||||||
return 'UNKNOWN'
|
if powerState == PowerState.UNKNOWN:
|
||||||
if powerState == PowerState.OFF:
|
return 'UNKNOWN'
|
||||||
return 'OFF'
|
if powerState == PowerState.OFF:
|
||||||
if powerState == PowerState.ON:
|
return 'OFF'
|
||||||
return 'ON'
|
if powerState == PowerState.ON:
|
||||||
if powerState == PowerState.SLEEP:
|
return 'ON'
|
||||||
return 'SLEEP'
|
if powerState == PowerState.SLEEP:
|
||||||
if powerState == PowerState.UNPLUGGED:
|
return 'SLEEP'
|
||||||
return 'UNPLUGGED'
|
if powerState == PowerState.UNPLUGGED:
|
||||||
else:
|
return 'UNPLUGGED'
|
||||||
assert( False )
|
else:
|
||||||
|
assert False
|
||||||
|
|
|
@ -1,249 +1,255 @@
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
from .JobsState import *
|
from .JobsState import JobsState
|
||||||
from .QueueMachine import *
|
from .QueueMachine import QueueMachine, QueueMachineStateFlags
|
||||||
from .Util import *
|
from .Util import *
|
||||||
from .Log import *
|
from .Log import logError
|
||||||
from .Job import *
|
from .Job import JobStateFlags, JobId, Job, ParallelEnvironment
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
class QstatParser:
|
class QstatParser:
|
||||||
def parseJobState( self, strJobStatus ):
|
def parseJobState(self, strJobStatus):
|
||||||
jobState = 0
|
jobState = 0
|
||||||
for i in range(0, len(strJobStatus) ):
|
for i in range(0, len(strJobStatus)):
|
||||||
c = strJobStatus[i]
|
c = strJobStatus[i]
|
||||||
if c == 'r':
|
if c == 'r':
|
||||||
jobState += JobStateFlags.RUNNING
|
jobState += JobStateFlags.RUNNING
|
||||||
elif c == 'w':
|
elif c == 'w':
|
||||||
jobState += JobStateFlags.WAITING
|
jobState += JobStateFlags.WAITING
|
||||||
elif c == 'q':
|
elif c == 'q':
|
||||||
jobState += JobStateFlags.QUEUED
|
jobState += JobStateFlags.QUEUED
|
||||||
elif c == 't':
|
elif c == 't':
|
||||||
jobState += JobStateFlags.TRANSFERING
|
jobState += JobStateFlags.TRANSFERING
|
||||||
elif c == 'd':
|
elif c == 'd':
|
||||||
jobState += JobStateFlags.DELETED
|
jobState += JobStateFlags.DELETED
|
||||||
elif c == 'h':
|
elif c == 'h':
|
||||||
jobState += JobStateFlags.HOLD
|
jobState += JobStateFlags.HOLD
|
||||||
elif c == 's':
|
elif c == 's':
|
||||||
jobState += JobStateFlags.SUSPENDED
|
jobState += JobStateFlags.SUSPENDED
|
||||||
elif c == 'E':
|
elif c == 'E':
|
||||||
jobState += JobStateFlags.ERROR
|
jobState += JobStateFlags.ERROR
|
||||||
else:
|
else:
|
||||||
assert False, 'unhandled job state flag :"' + c + '"'
|
assert False, 'unhandled job state flag :"' + c + '"'
|
||||||
return jobState
|
return jobState
|
||||||
def parseQueueMachineState( self, strQueueMachineStatus ):
|
|
||||||
queueMachineState = 0
|
def parseQueueMachineState(self, strQueueMachineStatus):
|
||||||
for i in range(0, len(strQueueMachineStatus) ):
|
queueMachineState = 0
|
||||||
c = strQueueMachineStatus[i]
|
for i in range(0, len(strQueueMachineStatus)):
|
||||||
if c == 'd':
|
c = strQueueMachineStatus[i]
|
||||||
queueMachineState += QueueMachineStateFlags.DISABLED
|
if c == 'd':
|
||||||
elif c == 'a':
|
queueMachineState += QueueMachineStateFlags.DISABLED
|
||||||
queueMachineState += QueueMachineStateFlags.ALARM
|
elif c == 'a':
|
||||||
elif c == 'u':
|
queueMachineState += QueueMachineStateFlags.ALARM
|
||||||
queueMachineState += QueueMachineStateFlags.UNKNOWN
|
elif c == 'u':
|
||||||
elif c == 'E':
|
queueMachineState += QueueMachineStateFlags.UNKNOWN
|
||||||
queueMachineState += QueueMachineStateFlags.ERROR
|
elif c == 'E':
|
||||||
elif c == 'o':
|
queueMachineState += QueueMachineStateFlags.ERROR
|
||||||
queueMachineState += QueueMachineStateFlags.OBSOLETE
|
elif c == 'o':
|
||||||
elif c == 's':
|
queueMachineState += QueueMachineStateFlags.OBSOLETE
|
||||||
queueMachineState += QueueMachineStateFlags.SUSPENDED
|
elif c == 's':
|
||||||
else:
|
queueMachineState += QueueMachineStateFlags.SUSPENDED
|
||||||
assert False, 'unhandled queue machine state flag :"' + c + '"'
|
else:
|
||||||
return queueMachineState
|
assert False, 'unhandled queue machine state flag :"' + c + '"'
|
||||||
def parseQstatOutput( self, qstatOutput ):
|
return queueMachineState
|
||||||
"""
|
|
||||||
parses result of command 'qstat -f -u \* -pri'
|
def parseQstatOutput(self, qstatOutput):
|
||||||
"""
|
"""
|
||||||
|
parses result of command 'qstat -f -u \* -pri'
|
||||||
def parse_pending_tasks(task_ranges_sequence):
|
"""
|
||||||
"""
|
|
||||||
parses a job's task ids encoded in the form of a string containing a sequence of ranges
|
def parse_pending_tasks(task_ranges_sequence):
|
||||||
|
"""
|
||||||
:param str task_ranges_sequence: a job's task ids encoded in the form of a string containing a sequence of non overlapping ranges separated with a comma. Each range is expected to be in the form "<min_index>-<max_index>:<step>"
|
parses a job's task ids encoded in the form of a string containing a sequence of ranges
|
||||||
:return list(int): the list of task ids
|
|
||||||
|
:param str task_ranges_sequence: a job's task ids encoded in the form of a string containing a sequence of non overlapping ranges separated with a comma. Each range is expected to be in the form "<min_index>-<max_index>:<step>"
|
||||||
for example, this function would return [1, 2, 3, 4, 6, 7, 8] for the input string "1-4:1,6-8:1"
|
:return list(int): the list of task ids
|
||||||
"""
|
|
||||||
task_ids = []
|
for example, this function would return [1, 2, 3, 4, 6, 7, 8] for the input string "1-4:1,6-8:1"
|
||||||
astrRanges = re.split(',', task_ranges_sequence)
|
"""
|
||||||
for strRange in astrRanges:
|
task_ids = []
|
||||||
singleIndexMatch = re.match('^(?P<elementIndex>[0-9]+)$', strRange)
|
astrRanges = re.split(',', task_ranges_sequence)
|
||||||
if singleIndexMatch:
|
for strRange in astrRanges:
|
||||||
iElementIndex = int(singleIndexMatch.group('elementIndex'))
|
singleIndexMatch = re.match('^(?P<elementIndex>[0-9]+)$', strRange)
|
||||||
task_ids.extend(range(iElementIndex, iElementIndex+1))
|
if singleIndexMatch:
|
||||||
else:
|
iElementIndex = int(singleIndexMatch.group('elementIndex'))
|
||||||
# we expect strRange to be of the form "1-4:1", where :
|
task_ids.extend(range(iElementIndex, iElementIndex + 1))
|
||||||
# the 1st number is the min element index (sge imposes it to be greater than 0)
|
else:
|
||||||
# the 2nd number is the max element index
|
# we expect strRange to be of the form "1-4:1", where :
|
||||||
# the 3rd number is the step between consecutive element indices
|
# the 1st number is the min element index (sge imposes it to be greater than 0)
|
||||||
rangeMatch = re.match( '^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange)
|
# the 2nd number is the max element index
|
||||||
if rangeMatch == None:
|
# the 3rd number is the step between consecutive element indices
|
||||||
logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line) )
|
rangeMatch = re.match('^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange)
|
||||||
assert(False)
|
if rangeMatch is None:
|
||||||
iMinElementIndex=int(rangeMatch.group('minElementIndex'))
|
logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line))
|
||||||
iMaxElementIndex=int(rangeMatch.group('maxElementIndex'))
|
assert False
|
||||||
iStepBetweenIndices=int(rangeMatch.group('stepBetweenIndices'))
|
iMinElementIndex = int(rangeMatch.group('minElementIndex'))
|
||||||
task_ids.extend(range(iMinElementIndex, iMaxElementIndex+1, iStepBetweenIndices))
|
iMaxElementIndex = int(rangeMatch.group('maxElementIndex'))
|
||||||
return task_ids
|
iStepBetweenIndices = int(rangeMatch.group('stepBetweenIndices'))
|
||||||
|
task_ids.extend(range(iMinElementIndex, iMaxElementIndex + 1, iStepBetweenIndices))
|
||||||
|
return task_ids
|
||||||
# ugly hack to work around the fact that qstat truncates the fqdn of cluster nodes
|
|
||||||
# graffy@physix-master:~$ qstat -f -u \*
|
# ugly hack to work around the fact that qstat truncates the fqdn of cluster nodes
|
||||||
# queuename qtype resv/used/tot. load_avg arch states
|
# graffy@physix-master:~$ qstat -f -u \*
|
||||||
# ---------------------------------------------------------------------------------
|
# queuename qtype resv/used/tot. load_avg arch states
|
||||||
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
|
# ---------------------------------------------------------------------------------
|
||||||
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
|
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
|
||||||
qstatOutput = re.sub('\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput)
|
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
|
||||||
|
qstatOutput = re.sub(r'\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput)
|
||||||
jobsState = JobsState()
|
|
||||||
f = io.StringIO(qstatOutput)
|
jobsState = JobsState()
|
||||||
line = f.readline()
|
f = io.StringIO(qstatOutput)
|
||||||
currentQueueMachine = None
|
line = f.readline()
|
||||||
bInPendingJobsSection = False
|
currentQueueMachine = None
|
||||||
# examples of job line :
|
bInPendingJobsSection = False
|
||||||
# 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1
|
# examples of job line :
|
||||||
# a typical job line in the pending jobs section looks like this :
|
# 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1
|
||||||
# 43645 0.00000 LC_LV_MC aghoufi qw 08/21/2009 08:14:58 1
|
# a typical job line in the pending jobs section looks like this :
|
||||||
# a typical running job array line looks like this
|
# 43645 0.00000 LC_LV_MC aghoufi qw 08/21/2009 08:14:58 1
|
||||||
# 43619 0.56000 SimpleJobA raffy r 08/20/2009 18:13:03 1 3
|
# a typical running job array line looks like this
|
||||||
# a typical job array line in the pending jobs section looks like this
|
# 43619 0.56000 SimpleJobA raffy r 08/20/2009 18:13:03 1 3
|
||||||
# 43646 0.00000 SimpleJobA raffy qw 08/21/2009 09:56:40 1 1-4:1
|
# a typical job array line in the pending jobs section looks like this
|
||||||
|
# 43646 0.00000 SimpleJobA raffy qw 08/21/2009 09:56:40 1 1-4:1
|
||||||
# nurg The job's total urgency value in normalized fashion.
|
|
||||||
# npprior The job's -p priority in normalized fashion.
|
# nurg The job's total urgency value in normalized fashion.
|
||||||
# ntckts The job's ticket amount in normalized fashion.
|
# npprior The job's -p priority in normalized fashion.
|
||||||
# ppri The job's -p priority as specified by the user.
|
# ntckts The job's ticket amount in normalized fashion.
|
||||||
|
# ppri The job's -p priority as specified by the user.
|
||||||
jobRegularExp = re.compile( '^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$' )
|
|
||||||
# example of machine line :
|
jobRegularExp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$')
|
||||||
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
|
# example of machine line :
|
||||||
machineRegularExp = re.compile( '^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)' )
|
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
|
||||||
pendingJobsHeaderRegularExp = re.compile( '^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*' )
|
machineRegularExp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)')
|
||||||
while( len(line) > 0 ):
|
pendingJobsHeaderRegularExp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*')
|
||||||
# print line
|
while len(line) > 0:
|
||||||
# check if the current line is a line describing a job running on a machine
|
# print line
|
||||||
matchObj = jobRegularExp.match( line )
|
# check if the current line is a line describing a job running on a machine
|
||||||
if matchObj:
|
matchObj = jobRegularExp.match(line)
|
||||||
# we are dealing with a job line
|
if matchObj:
|
||||||
if not bInPendingJobsSection:
|
# we are dealing with a job line
|
||||||
assert( currentQueueMachine )
|
if not bInPendingJobsSection:
|
||||||
#log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
|
assert currentQueueMachine
|
||||||
iJobId = int(matchObj.group('jobId'))
|
# log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
|
||||||
jobState = self.parseJobState( matchObj.group('jobStatus') )
|
iJobId = int(matchObj.group('jobId'))
|
||||||
strJobArrayDetails = matchObj.group('jobArrayDetails')
|
logging.debug('iJobId = %d' % iJobId)
|
||||||
bIsJobArray = (len(strJobArrayDetails) != 0)
|
jobState = self.parseJobState(matchObj.group('jobStatus'))
|
||||||
#logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
|
strJobArrayDetails = matchObj.group('jobArrayDetails')
|
||||||
# each element of a job array is treated as a separate job for the sake of simplicity.
|
bIsJobArray = (len(strJobArrayDetails) != 0)
|
||||||
# For these elements, the job id in sge sense is the same, but they are different in this program's sense
|
# logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
|
||||||
task_ids = range(0,1) # just one element, unless it's a job array
|
# each element of a job array is treated as a separate job for the sake of simplicity.
|
||||||
if bIsJobArray:
|
# For these elements, the job id in sge sense is the same, but they are different in this program's sense
|
||||||
if bInPendingJobsSection:
|
task_ids = range(0, 1) # just one element, unless it's a job array
|
||||||
task_ids = parse_pending_tasks(strJobArrayDetails)
|
if bIsJobArray:
|
||||||
else:
|
if bInPendingJobsSection:
|
||||||
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
|
task_ids = parse_pending_tasks(strJobArrayDetails)
|
||||||
iJobArrayElementIndex = int(strJobArrayDetails)
|
else:
|
||||||
assert(iJobArrayElementIndex != 0) # sge does not allow element indices to be 0
|
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
|
||||||
task_ids = range(iJobArrayElementIndex,iJobArrayElementIndex+1)
|
iJobArrayElementIndex = int(strJobArrayDetails)
|
||||||
for task_id in task_ids:
|
assert iJobArrayElementIndex != 0 # sge does not allow element indices to be 0
|
||||||
jobId = None
|
task_ids = range(iJobArrayElementIndex, iJobArrayElementIndex + 1)
|
||||||
if bIsJobArray:
|
logging.debug('task_ids = %s' % task_ids)
|
||||||
jobId = JobId(iJobId, task_id)
|
for task_id in task_ids:
|
||||||
else:
|
logging.debug('task_id = %s' % task_id)
|
||||||
jobId = JobId(iJobId)
|
jobId = None
|
||||||
job = jobsState.getJob(jobId)
|
if bIsJobArray:
|
||||||
#logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr()))
|
jobId = JobId(iJobId, task_id)
|
||||||
if job == None:
|
else:
|
||||||
# this job hasn't been encountered yet in the output of qstat ...
|
jobId = JobId(iJobId)
|
||||||
# we could either be in the pending jobs section or in the running jobs section
|
job = jobsState.getJob(jobId)
|
||||||
job = Job(jobId)
|
# logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr()))
|
||||||
jobsState.addJob( job )
|
if job is None:
|
||||||
job.setState( jobState )
|
# this job hasn't been encountered yet in the output of qstat ...
|
||||||
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime')
|
# we could either be in the pending jobs section or in the running jobs section
|
||||||
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S')
|
job = Job(jobId)
|
||||||
if bInPendingJobsSection:
|
jobsState.addJob(job)
|
||||||
job.setSubmitTime( jobStartOrSubmitTime )
|
job.setState(jobState)
|
||||||
else:
|
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime')
|
||||||
job.setStartTime( jobStartOrSubmitTime )
|
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S')
|
||||||
job.setOwner( matchObj.group('jobOwner') )
|
if bInPendingJobsSection:
|
||||||
job.setScriptName( matchObj.group('jobScriptName') )
|
job.setSubmitTime(jobStartOrSubmitTime)
|
||||||
if bInPendingJobsSection:
|
else:
|
||||||
job.setNumRequiredSlots(int(matchObj.group('numSlots')))
|
job.setStartTime(jobStartOrSubmitTime)
|
||||||
else:
|
job.setOwner(matchObj.group('jobOwner'))
|
||||||
assert( not bInPendingJobsSection ) # if we are in the pending jobs section, the job should be new
|
job.setScriptName(matchObj.group('jobScriptName'))
|
||||||
if not bInPendingJobsSection:
|
if bInPendingJobsSection:
|
||||||
job.addSlots( currentQueueMachine.getName(), int(matchObj.group('numSlots')) )
|
job.setNumRequiredSlots(int(matchObj.group('numSlots')))
|
||||||
else:
|
else:
|
||||||
# the current line does not describe a job
|
assert not bInPendingJobsSection # if we are in the pending jobs section, the job should be new
|
||||||
if not bInPendingJobsSection:
|
if not bInPendingJobsSection:
|
||||||
# check if this line describes the status of a machine
|
job.addSlots(currentQueueMachine.getName(), int(matchObj.group('numSlots')))
|
||||||
matchObj = machineRegularExp.match( line )
|
else:
|
||||||
if matchObj:
|
# the current line does not describe a job
|
||||||
queueName = matchObj.group('queueName')
|
if not bInPendingJobsSection:
|
||||||
machineName = matchObj.group('machineName')
|
# check if this line describes the status of a machine
|
||||||
queueMachine = QueueMachine( queueName, machineName )
|
matchObj = machineRegularExp.match(line)
|
||||||
#log(line)
|
if matchObj:
|
||||||
#log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
|
queueName = matchObj.group('queueName')
|
||||||
#log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
|
machineName = matchObj.group('machineName')
|
||||||
queueMachine.setNumSlots( int( matchObj.group('numTotalSlots') ) )
|
queueMachine = QueueMachine(queueName, machineName)
|
||||||
queueMachine.setNumUsedSlots( int( matchObj.group('numUsedSlots') ) )
|
# log(line)
|
||||||
strCpuLoad = matchObj.group('cpuLoad')
|
# log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
|
||||||
if strCpuLoad != '-NA-':
|
# log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
|
||||||
queueMachine.setCpuLoad( float(strCpuLoad) )
|
queueMachine.setNumSlots(int(matchObj.group('numTotalSlots')))
|
||||||
|
queueMachine.setNumUsedSlots(int(matchObj.group('numUsedSlots')))
|
||||||
strQueueMachineState = matchObj.group('queueMachineStatus')
|
strCpuLoad = matchObj.group('cpuLoad')
|
||||||
queueMachine.setState( self.parseQueueMachineState( strQueueMachineState ) )
|
if strCpuLoad != '-NA-':
|
||||||
#log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
|
queueMachine.setCpuLoad(float(strCpuLoad))
|
||||||
#log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
|
|
||||||
currentQueueMachine = queueMachine
|
strQueueMachineState = matchObj.group('queueMachineStatus')
|
||||||
jobsState.addQueueMachine( queueMachine )
|
queueMachine.setState(self.parseQueueMachineState(strQueueMachineState))
|
||||||
else:
|
# log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
|
||||||
matchObj = pendingJobsHeaderRegularExp.match( line )
|
# log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
|
||||||
if matchObj:
|
currentQueueMachine = queueMachine
|
||||||
bInPendingJobsSection = True
|
jobsState.addQueueMachine(queueMachine)
|
||||||
currentQueueMachine = None
|
else:
|
||||||
else:
|
matchObj = pendingJobsHeaderRegularExp.match(line)
|
||||||
#print line
|
if matchObj:
|
||||||
None
|
bInPendingJobsSection = True
|
||||||
else:
|
currentQueueMachine = None
|
||||||
# we are in a pending jobs section
|
else:
|
||||||
matchObj = re.match('^[#]+$', line)
|
# print line
|
||||||
if not matchObj:
|
None
|
||||||
# unexpected line
|
else:
|
||||||
print('line = "' + line + '"')
|
# we are in a pending jobs section
|
||||||
assert( False )
|
matchObj = re.match('^[#]+$', line)
|
||||||
None
|
if not matchObj:
|
||||||
line = f.readline()
|
# unexpected line
|
||||||
f.close()
|
print('line = "' + line + '"')
|
||||||
return jobsState
|
assert False
|
||||||
def parseJobDetails( self, qstatOutput, job ):
|
None
|
||||||
"""
|
line = f.readline()
|
||||||
adds to job the details parsed from the output of the "qstat -j <jobid>" command
|
f.close()
|
||||||
"""
|
return jobsState
|
||||||
f = io.StringIO(qstatOutput)
|
|
||||||
line = f.readline()
|
def parseJobDetails(self, qstatOutput, job):
|
||||||
fieldRegularExp = re.compile( '^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$' )
|
"""
|
||||||
while( len(line) > 0 ):
|
adds to job the details parsed from the output of the "qstat -j <jobid>" command
|
||||||
# print line
|
"""
|
||||||
# check if the current line is a field line of the form "<fieldName>: <fieldValue>"
|
f = io.StringIO(qstatOutput)
|
||||||
matchObj = fieldRegularExp.match( line )
|
line = f.readline()
|
||||||
if matchObj:
|
fieldRegularExp = re.compile('^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$')
|
||||||
fieldName = matchObj.group('fieldName')
|
while len(line) > 0:
|
||||||
strFieldValue = matchObj.group('fieldValue')
|
# print line
|
||||||
if fieldName == 'job_number':
|
# check if the current line is a field line of the form "<fieldName>: <fieldValue>"
|
||||||
assert( job.getId().asStr() == strFieldValue )
|
matchObj = fieldRegularExp.match(line)
|
||||||
elif fieldName == 'hard_queue_list':
|
if matchObj:
|
||||||
allowedQueues = strFieldValue.split(',')
|
fieldName = matchObj.group('fieldName')
|
||||||
assert(len(allowedQueues) > 0)
|
strFieldValue = matchObj.group('fieldValue')
|
||||||
job.m_jobRequirements.m_queues = allowedQueues
|
if fieldName == 'job_number':
|
||||||
elif fieldName == 'parallel environment':
|
assert job.getId().asStr() == strFieldValue
|
||||||
# the value could be 'ompi range: 32'
|
elif fieldName == 'hard_queue_list':
|
||||||
matchObj = re.match('ompi range: (?P<numSlots>[0-9]+)[?]*', strFieldValue)
|
allowedQueues = strFieldValue.split(',')
|
||||||
if matchObj:
|
assert len(allowedQueues) > 0
|
||||||
job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI
|
job.m_jobRequirements.m_queues = allowedQueues
|
||||||
else:
|
elif fieldName == 'parallel environment':
|
||||||
assert( False )
|
# the value could be 'ompi range: 32'
|
||||||
else:
|
matchObj = re.match('ompi range: (?P<numSlots>[0-9]+)[?]*', strFieldValue)
|
||||||
# ignore the other fields
|
if matchObj:
|
||||||
None
|
job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI
|
||||||
line = f.readline()
|
else:
|
||||||
f.close()
|
assert False
|
||||||
|
else:
|
||||||
|
# ignore the other fields
|
||||||
|
None
|
||||||
|
line = f.readline()
|
||||||
|
f.close()
|
||||||
|
|
|
@ -1,65 +1,81 @@
|
||||||
|
|
||||||
class QueueMachineStateFlags: #
|
class QueueMachineStateFlags: #
|
||||||
DISABLED=1 # the queue machine is disabled
|
DISABLED = 1 # the queue machine is disabled
|
||||||
ALARM=2 # the queue machine is in alarm state (see man qstat)
|
ALARM = 2 # the queue machine is in alarm state (see man qstat)
|
||||||
UNKNOWN=4 # the queue machine is in unknown state because sge_execd cannot be contacted (see man qstat)
|
UNKNOWN = 4 # the queue machine is in unknown state because sge_execd cannot be contacted (see man qstat)
|
||||||
ERROR=8 # the queue is in error state
|
ERROR = 8 # the queue is in error state
|
||||||
OBSOLETE=16 # the queue no longer exists but it is still visible because it still contains running jobs
|
OBSOLETE = 16 # the queue no longer exists but it is still visible because it still contains running jobs
|
||||||
SUSPENDED=32 # the queue machine is suspended
|
SUSPENDED = 32 # the queue machine is suspended
|
||||||
|
|
||||||
|
|
||||||
class QueueMachine:
|
class QueueMachine:
|
||||||
"""
|
"""
|
||||||
a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
|
a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
|
||||||
"""
|
"""
|
||||||
def __init__( self, queueName, machineName ):
|
def __init__(self, queueName, machineName):
|
||||||
self.m_queueName = queueName
|
self.m_queueName = queueName
|
||||||
self.m_machineName = machineName
|
self.m_machineName = machineName
|
||||||
self.m_numSlots = None
|
self.m_numSlots = None
|
||||||
self.m_numUsedSlots = None
|
self.m_numUsedSlots = None
|
||||||
self.m_fCpuLoad = None
|
self.m_fCpuLoad = None
|
||||||
self.m_stateFlags = 0
|
self.m_stateFlags = 0
|
||||||
self.m_strDisableMessage = ''
|
self.m_strDisableMessage = ''
|
||||||
def getName( self ):
|
|
||||||
"""
|
def getName(self):
|
||||||
returns the name of the machine queue (such as allintel.q@simpatix10)
|
"""
|
||||||
"""
|
returns the name of the machine queue (such as allintel.q@simpatix10)
|
||||||
return self.m_queueName + '@' + self.m_machineName
|
"""
|
||||||
|
return self.m_queueName + '@' + self.m_machineName
|
||||||
def getQueueName( self ):
|
|
||||||
return self.m_queueName
|
def getQueueName(self):
|
||||||
def getMachineName( self ):
|
return self.m_queueName
|
||||||
return self.m_machineName
|
|
||||||
def setNumSlots( self, numSlots ):
|
def getMachineName(self):
|
||||||
self.m_numSlots = numSlots
|
return self.m_machineName
|
||||||
def setNumUsedSlots( self, numSlots ):
|
|
||||||
self.m_numUsedSlots = numSlots
|
def setNumSlots(self, numSlots):
|
||||||
def getNumSlots( self ):
|
self.m_numSlots = numSlots
|
||||||
assert( self.m_numSlots != None )
|
|
||||||
return self.m_numSlots
|
def setNumUsedSlots(self, numSlots):
|
||||||
def getNumUsedSlots( self ):
|
self.m_numUsedSlots = numSlots
|
||||||
assert( self.m_numUsedSlots != None )
|
|
||||||
return self.m_numUsedSlots
|
def getNumSlots(self):
|
||||||
def setCpuLoad( self, fCpuLoad ):
|
assert self.m_numSlots is not None
|
||||||
self.m_fCpuLoad = fCpuLoad
|
return self.m_numSlots
|
||||||
def cpuLoadIsAvailable( self ):
|
|
||||||
return self.m_fCpuLoad != None
|
def getNumUsedSlots(self):
|
||||||
def getCpuLoad( self ):
|
assert self.m_numUsedSlots is not None
|
||||||
assert( self.m_fCpuLoad != None )
|
return self.m_numUsedSlots
|
||||||
return self.m_fCpuLoad
|
|
||||||
def setState( self, state ):
|
def setCpuLoad(self, fCpuLoad):
|
||||||
self.m_stateFlags = state
|
self.m_fCpuLoad = fCpuLoad
|
||||||
def isDisabled( self ):
|
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.DISABLED
|
def cpuLoadIsAvailable(self):
|
||||||
def isInErrorState( self ):
|
return self.m_fCpuLoad is not None
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.ERROR
|
|
||||||
def isResponding( self ):
|
def getCpuLoad(self):
|
||||||
return not (self.m_stateFlags & QueueMachineStateFlags.UNKNOWN)
|
assert self.m_fCpuLoad is not None
|
||||||
def isInAlarmState( self ):
|
return self.m_fCpuLoad
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.ALARM
|
|
||||||
def isSuspended( self ):
|
def setState(self, state):
|
||||||
return self.m_stateFlags & QueueMachineStateFlags.SUSPENDED
|
self.m_stateFlags = state
|
||||||
"""
|
|
||||||
def getStateAsString( self ):
|
def isDisabled(self):
|
||||||
assert( self.m_strState != None )
|
return self.m_stateFlags & QueueMachineStateFlags.DISABLED
|
||||||
return self.m_strState
|
|
||||||
"""
|
def isInErrorState(self):
|
||||||
|
return self.m_stateFlags & QueueMachineStateFlags.ERROR
|
||||||
|
|
||||||
|
def isResponding(self):
|
||||||
|
return not (self.m_stateFlags & QueueMachineStateFlags.UNKNOWN)
|
||||||
|
|
||||||
|
def isInAlarmState(self):
|
||||||
|
return self.m_stateFlags & QueueMachineStateFlags.ALARM
|
||||||
|
|
||||||
|
def isSuspended(self):
|
||||||
|
return self.m_stateFlags & QueueMachineStateFlags.SUSPENDED
|
||||||
|
"""
|
||||||
|
def getStateAsString(self):
|
||||||
|
assert(self.m_strState is not None)
|
||||||
|
return self.m_strState
|
||||||
|
"""
|
||||||
|
|
|
@ -1,141 +1,147 @@
|
||||||
from PowerState import *
|
from PowerState import PowerState
|
||||||
from Log import *
|
from Log import logInfo
|
||||||
import time
|
import time
|
||||||
import copy
|
import copy
|
||||||
|
|
||||||
|
|
||||||
class Slot:
|
class Slot:
|
||||||
def __init__( self ):
|
def __init__(self):
|
||||||
self.m_queueMachine = None
|
self.m_queueMachine = None
|
||||||
self.m_numSlots = None
|
self.m_numSlots = None
|
||||||
self.m_job = None # job for which this slot is allocated
|
self.m_job = None # job for which this slot is allocated
|
||||||
|
|
||||||
|
|
||||||
class SlotAllocator:
|
class SlotAllocator:
|
||||||
"""
|
"""
|
||||||
a class that defines a strategy for allocating free slots for the given pending jobs
|
a class that defines a strategy for allocating free slots for the given pending jobs
|
||||||
"""
|
"""
|
||||||
def getMachinesThatNeedWakeUp( self, pendingJobs, clusterState ):
|
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||||
"""
|
"""
|
||||||
returns the list of machines that need to wake up to make pending jobs running
|
returns the list of machines that need to wake up to make pending jobs running
|
||||||
"""
|
"""
|
||||||
assert( False ) # this method is abstract
|
assert False # this method is abstract
|
||||||
|
|
||||||
class SimpleSlotAllocator( SlotAllocator ):
|
|
||||||
def getMachinesThatNeedWakeUp( self, pendingJobs, clusterState ):
|
class SimpleSlotAllocator(SlotAllocator):
|
||||||
machinesThatNeedWakeUp = {}
|
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||||
highestPriorityPendingJob = pendingJobs.values()[0]
|
machinesThatNeedWakeUp = {}
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr() )
|
highestPriorityPendingJob = pendingJobs.values()[0]
|
||||||
numFreeSlots = {} # contains the number of free slots for each queueMachine
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
numFreeSlots = {} # contains the number of free slots for each queueMachine
|
||||||
numFreeSlots[ queueMachine ] = clusterState.getJobsState().getNumFreeSlotsOnQueueMachine( queueMachine )
|
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[ %s ] with %d ' % (queueMachine.getName(), numFreeSlots[ queueMachine ]) )
|
numFreeSlots[queueMachine] = clusterState.getJobsState().getNumFreeSlotsOnQueueMachine(queueMachine)
|
||||||
remainingNumSlotsToAllocate = highestPriorityPendingJob.m_jobRequirements.m_numSlots
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[%s] with %d ' % (queueMachine.getName(), numFreeSlots[queueMachine]))
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate )
|
remainingNumSlotsToAllocate = highestPriorityPendingJob.m_jobRequirements.m_numSlots
|
||||||
# first look in running machines if there are available slots
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
# first look in running machines if there are available slots
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName() )
|
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||||
machine = clusterState.getMachines()[ queueMachine.getMachineName() ]
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
|
||||||
if machine.getPowerState() == PowerState.ON:
|
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
||||||
if clusterState.queueMachineFitsJobRequirements( queueMachine, highestPriorityPendingJob.m_jobRequirements ):
|
if machine.getPowerState() == PowerState.ON:
|
||||||
numSlotsAllocatedOnThisMachine = min( numFreeSlots[ queueMachine ], remainingNumSlotsToAllocate )
|
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements):
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName() ) )
|
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
||||||
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName()))
|
||||||
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
|
||||||
numFreeSlots[ queueMachine ] -= numSlotsAllocatedOnThisMachine
|
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate )
|
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
||||||
assert( remainingNumSlotsToAllocate >= 0 )
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||||
if remainingNumSlotsToAllocate == 0:
|
assert remainingNumSlotsToAllocate >= 0
|
||||||
break
|
if remainingNumSlotsToAllocate == 0:
|
||||||
if remainingNumSlotsToAllocate > 0:
|
break
|
||||||
# now look into machines that are asleep
|
if remainingNumSlotsToAllocate > 0:
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
# now look into machines that are asleep
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName() )
|
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||||
machine = clusterState.getMachines()[ queueMachine.getMachineName() ]
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
|
||||||
if machine.getPowerState() == PowerState.SLEEP:
|
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
||||||
if clusterState.queueMachineFitsJobRequirements( queueMachine, highestPriorityPendingJob.m_jobRequirements ):
|
if machine.getPowerState() == PowerState.SLEEP:
|
||||||
numSlotsAllocatedOnThisMachine = min( numFreeSlots[ queueMachine ], remainingNumSlotsToAllocate )
|
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements):
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName() ) )
|
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
||||||
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName()))
|
||||||
numFreeSlots[ queueMachine ] -= numSlotsAllocatedOnThisMachine
|
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
||||||
machinesThatNeedWakeUp[ machine.getName() ] = machine
|
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
||||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate )
|
machinesThatNeedWakeUp[machine.getName()] = machine
|
||||||
assert( remainingNumSlotsToAllocate >= 0 )
|
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||||
if remainingNumSlotsToAllocate == 0:
|
assert remainingNumSlotsToAllocate >= 0
|
||||||
break
|
if remainingNumSlotsToAllocate == 0:
|
||||||
if remainingNumSlotsToAllocate != 0:
|
break
|
||||||
return {} # not enough slots available
|
if remainingNumSlotsToAllocate != 0:
|
||||||
return machinesThatNeedWakeUp
|
return {} # not enough slots available
|
||||||
|
return machinesThatNeedWakeUp
|
||||||
class DecoupledSlotAllocator( SlotAllocator ):
|
|
||||||
"""
|
|
||||||
a slot allocator that doesn't know much about sge, and does not attempt to guess what sge's scheduler would do
|
class DecoupledSlotAllocator(SlotAllocator):
|
||||||
Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
|
"""
|
||||||
"""
|
a slot allocator that doesn't know much about sge, and does not attempt to guess what sge's scheduler would do
|
||||||
def __init__( self ):
|
Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
|
||||||
self.m_delayBetweenPeriodicChecks = -1 # in seconds. Disable periodic checks by setting this to -1
|
"""
|
||||||
self.m_lastCheckTime = time.time()
|
def __init__(self):
|
||||||
self.m_lastClusterState = None
|
self.m_delayBetweenPeriodicChecks = -1 # in seconds. Disable periodic checks by setting this to -1
|
||||||
def jobsStateHasChanged( self, newClusterState ):
|
self.m_lastCheckTime = time.time()
|
||||||
"""
|
self.m_lastClusterState = None
|
||||||
returns true if there is a change in the cluster state that can cause a pending job
|
|
||||||
to start (provided all machines are enabled)
|
def jobsStateHasChanged(self, newClusterState):
|
||||||
"""
|
"""
|
||||||
oldJobs = {}
|
returns true if there is a change in the cluster state that can cause a pending job
|
||||||
if self.m_lastClusterState:
|
to start (provided all machines are enabled)
|
||||||
oldJobs = self.m_lastClusterState.m_jobsState.m_jobs
|
"""
|
||||||
newJobs = newClusterState.m_jobsState.m_jobs
|
oldJobs = {}
|
||||||
bJobsHaveChanged = False
|
if self.m_lastClusterState:
|
||||||
oldJobsOnly = oldJobs.copy() # shallow copy
|
oldJobs = self.m_lastClusterState.m_jobsState.m_jobs
|
||||||
#print 'oldJobs : ', oldJobs
|
newJobs = newClusterState.m_jobsState.m_jobs
|
||||||
#print 'newJobs : ', newJobs
|
bJobsHaveChanged = False
|
||||||
"""
|
oldJobsOnly = oldJobs.copy() # shallow copy
|
||||||
print 'self.m_lastClusterState', self.m_lastClusterState
|
# print 'oldJobs : ', oldJobs
|
||||||
print 'newClusterState', newClusterState
|
# print 'newJobs : ', newJobs
|
||||||
if self.m_lastClusterState:
|
"""
|
||||||
print 'self.m_lastClusterState.m_jobsState', self.m_lastClusterState.m_jobsState
|
print 'self.m_lastClusterState', self.m_lastClusterState
|
||||||
print 'newClusterState.m_jobsState', newClusterState.m_jobsState
|
print 'newClusterState', newClusterState
|
||||||
print 'id(self.m_lastClusterState) : ', id(self.m_lastClusterState)
|
if self.m_lastClusterState:
|
||||||
print 'id(newClusterState) : ', id(newClusterState)
|
print 'self.m_lastClusterState.m_jobsState', self.m_lastClusterState.m_jobsState
|
||||||
print 'len(oldJobs) : ', len(oldJobs)
|
print 'newClusterState.m_jobsState', newClusterState.m_jobsState
|
||||||
print 'len(newJobs) : ', len(newJobs)
|
print 'id(self.m_lastClusterState) : ', id(self.m_lastClusterState)
|
||||||
print 'id(oldJobs) : ', id(oldJobs)
|
print 'id(newClusterState) : ', id(newClusterState)
|
||||||
print 'id(newJobs) : ', id(newJobs)
|
print 'len(oldJobs) : ', len(oldJobs)
|
||||||
"""
|
print 'len(newJobs) : ', len(newJobs)
|
||||||
for newJob in newJobs.values():
|
print 'id(oldJobs) : ', id(oldJobs)
|
||||||
#logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
|
print 'id(newJobs) : ', id(newJobs)
|
||||||
if newJob.getId() in oldJobs:
|
"""
|
||||||
#logDebug('DecoupledSlotAllocator::jobsStateHasChanged job id=%d is in old jobs' % newJob.getId())
|
for newJob in newJobs.values():
|
||||||
del oldJobsOnly[newJob.getId()]
|
# logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
|
||||||
else:
|
if newJob.getId() in oldJobs:
|
||||||
# ah ... a new job has arrived
|
# logDebug('DecoupledSlotAllocator::jobsStateHasChanged job id=%d is in old jobs' % newJob.getId())
|
||||||
logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr() )
|
del oldJobsOnly[newJob.getId()]
|
||||||
bJobsHaveChanged = True
|
else:
|
||||||
if len(oldJobsOnly) != 0:
|
# ah ... a new job has arrived
|
||||||
for oldJob in oldJobsOnly.values():
|
logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr())
|
||||||
logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr() )
|
bJobsHaveChanged = True
|
||||||
# at least one old job has finished, freeing some slots
|
if len(oldJobsOnly) != 0:
|
||||||
bJobsHaveChanged = True
|
for oldJob in oldJobsOnly.values():
|
||||||
return bJobsHaveChanged
|
logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr())
|
||||||
def getMachinesThatNeedWakeUp( self, pendingJobs, clusterState ):
|
# at least one old job has finished, freeing some slots
|
||||||
machinesThatNeedWakeUp = {}
|
bJobsHaveChanged = True
|
||||||
bJobsStateHasChanged = self.jobsStateHasChanged( clusterState )
|
return bJobsHaveChanged
|
||||||
currentTime = time.time()
|
|
||||||
# we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
|
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||||
# for example changes in the requirements, in the allocation policy, etc...
|
machinesThatNeedWakeUp = {}
|
||||||
bItsTimeForPeriodicCheck = False
|
bJobsStateHasChanged = self.jobsStateHasChanged(clusterState)
|
||||||
if self.m_delayBetweenPeriodicChecks > 0:
|
currentTime = time.time()
|
||||||
bItsTimeForPeriodicCheck = (currentTime - self.m_lastCheckTime) > self.m_delayBetweenPeriodicChecks
|
# we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
|
||||||
if bJobsStateHasChanged or bItsTimeForPeriodicCheck:
|
# for example changes in the requirements, in the allocation policy, etc...
|
||||||
if bJobsStateHasChanged:
|
bItsTimeForPeriodicCheck = False
|
||||||
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed')
|
if self.m_delayBetweenPeriodicChecks > 0:
|
||||||
else:
|
bItsTimeForPeriodicCheck = (currentTime - self.m_lastCheckTime) > self.m_delayBetweenPeriodicChecks
|
||||||
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)')
|
if bJobsStateHasChanged or bItsTimeForPeriodicCheck:
|
||||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
if bJobsStateHasChanged:
|
||||||
if queueMachine.getMachineName() in clusterState.getMachines():
|
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed')
|
||||||
# this means that the machine is under the cluster controller's control
|
else:
|
||||||
machine = clusterState.getMachines()[ queueMachine.getMachineName() ]
|
logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)')
|
||||||
if machine.getPowerState() == PowerState.SLEEP:
|
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||||
machinesThatNeedWakeUp[ machine.getName() ] = machine
|
if queueMachine.getMachineName() in clusterState.getMachines():
|
||||||
self.m_lastCheckTime = currentTime
|
# this means that the machine is under the cluster controller's control
|
||||||
self.m_lastClusterState = copy.copy(clusterState)
|
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
||||||
#print 'self.m_lastClusterState', self.m_lastClusterState
|
if machine.getPowerState() == PowerState.SLEEP:
|
||||||
return machinesThatNeedWakeUp
|
machinesThatNeedWakeUp[machine.getName()] = machine
|
||||||
|
self.m_lastCheckTime = currentTime
|
||||||
|
self.m_lastClusterState = copy.copy(clusterState)
|
||||||
|
# print 'self.m_lastClusterState', self.m_lastClusterState
|
||||||
|
return machinesThatNeedWakeUp
|
||||||
|
|
|
@ -1,58 +1,58 @@
|
||||||
import Util
|
import time
|
||||||
from QstatParser import *
|
from Util import executeProgram
|
||||||
|
from QstatParser import QstatParser
|
||||||
|
from Log import logDebug, logWarning
|
||||||
|
|
||||||
|
|
||||||
class SunGridEngine:
|
class SunGridEngine:
|
||||||
|
|
||||||
def getCurrentJobsState( self ):
|
def getCurrentJobsState(self):
|
||||||
bBUG_00000009_IS_STILL_ALIVE = True
|
bBUG_00000009_IS_STILL_ALIVE = True
|
||||||
if bBUG_00000009_IS_STILL_ALIVE:
|
if bBUG_00000009_IS_STILL_ALIVE:
|
||||||
logDebug('Querying the current state of jobs')
|
logDebug('Querying the current state of jobs')
|
||||||
returnCode = -1
|
returnCode = -1
|
||||||
delayBetweenAttemps = 5 # in seconds
|
delayBetweenAttemps = 5 # in seconds
|
||||||
while returnCode != 0:
|
while returnCode != 0:
|
||||||
command = ['qstat', '-f', '-u', '*']
|
command = ['qstat', '-f', '-u', '*']
|
||||||
(returnCode, qstatOutput, stderr) = executeProgram( command )
|
(returnCode, qstatOutput, stderr) = executeProgram(command)
|
||||||
if returnCode != 0:
|
if returnCode != 0:
|
||||||
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps))
|
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps))
|
||||||
time.sleep(delayBetweenAttemps)
|
time.sleep(delayBetweenAttemps)
|
||||||
if bBUG_00000009_IS_STILL_ALIVE:
|
if bBUG_00000009_IS_STILL_ALIVE:
|
||||||
logDebug('Just got current state of jobs')
|
logDebug('Just got current state of jobs')
|
||||||
|
|
||||||
jobsState = QstatParser().parseQstatOutput( qstatOutput )
|
jobsState = QstatParser().parseQstatOutput(qstatOutput)
|
||||||
jobsState.setTime( time.time() )
|
jobsState.setTime(time.time())
|
||||||
|
|
||||||
|
|
||||||
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
|
|
||||||
if False: # no need for job details at the moment and since it's very slow, it's been disabled
|
|
||||||
for unused_jobId, job in jobsState.getPendingJobs().items():
|
|
||||||
(returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] )
|
|
||||||
assert returnCode != 0, 'prout'
|
|
||||||
QstatParser().parseJobDetails( stdout, job )
|
|
||||||
|
|
||||||
return jobsState
|
|
||||||
|
|
||||||
def setQueueInstanceActivation( self, strQueueInstanceName, bEnable ):
|
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
|
||||||
argument = 'd'
|
if False: # no need for job details at the moment and since it's very slow, it's been disabled
|
||||||
if bEnable:
|
for unused_jobId, job in jobsState.getPendingJobs().items():
|
||||||
argument = 'e'
|
(returnCode, stdout, stderr) = executeProgram(['qstat', '-j', job.getId().asStr()])
|
||||||
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
|
assert returnCode != 0, 'prout'
|
||||||
delayBetweenAttemps = 5 # in seconds
|
QstatParser().parseJobDetails(stdout, job)
|
||||||
while True:
|
|
||||||
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-'+argument, strQueueInstanceName])
|
|
||||||
if bBUG_00000269_IS_STILL_ALIVE:
|
|
||||||
# if the command failed, try again
|
|
||||||
if errorCode == 0:
|
|
||||||
break
|
|
||||||
time.sleep(delayBetweenAttemps)
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
return (errorCode == 0)
|
|
||||||
|
|
||||||
def queueIsEmpty( self, strMachineName ):
|
|
||||||
(returnCode, qstatOutput, unused_stderr) = executeProgram( ['qstat', '-f', '-u', '*'] )
|
|
||||||
assert( returnCode == 0 )
|
|
||||||
jobsState = QstatParser().parseQstatOutput( qstatOutput )
|
|
||||||
jobs = jobsState.getJobsOnMachine( strMachineName )
|
|
||||||
return (len(jobs) == 0)
|
|
||||||
|
|
||||||
|
return jobsState
|
||||||
|
|
||||||
|
def setQueueInstanceActivation(self, strQueueInstanceName, bEnable):
|
||||||
|
argument = 'd'
|
||||||
|
if bEnable:
|
||||||
|
argument = 'e'
|
||||||
|
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
|
||||||
|
delayBetweenAttemps = 5 # in seconds
|
||||||
|
while True:
|
||||||
|
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-' + argument, strQueueInstanceName])
|
||||||
|
if bBUG_00000269_IS_STILL_ALIVE:
|
||||||
|
# if the command failed, try again
|
||||||
|
if errorCode == 0:
|
||||||
|
break
|
||||||
|
time.sleep(delayBetweenAttemps)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
return (errorCode == 0)
|
||||||
|
|
||||||
|
def queueIsEmpty(self, strMachineName):
|
||||||
|
(returnCode, qstatOutput, unused_stderr) = executeProgram(['qstat', '-f', '-u', '*'])
|
||||||
|
assert returnCode == 0
|
||||||
|
jobsState = QstatParser().parseQstatOutput(qstatOutput)
|
||||||
|
jobs = jobsState.getJobsOnMachine(strMachineName)
|
||||||
|
return (len(jobs) == 0)
|
||||||
|
|
|
@ -1,53 +1,56 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0, '..')
|
sys.path.insert(0, '..')
|
||||||
from Log import *
|
from Log import logInfo
|
||||||
import Util
|
import Util
|
||||||
from PowerState import *
|
from PowerState import PowerState
|
||||||
|
|
||||||
from HTMLParser import HTMLParser
|
from HTMLParser import HTMLParser
|
||||||
|
|
||||||
|
|
||||||
def Test0000():
|
def Test0000():
|
||||||
logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
|
logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
|
||||||
strTargetMachineName = 'simpatix12'
|
strTargetMachineName = 'simpatix12'
|
||||||
ePowerState = Util.getPowerState(strTargetMachineName)
|
ePowerState = Util.getPowerState(strTargetMachineName)
|
||||||
while True:
|
while True:
|
||||||
if ePowerState == PowerState.ON:
|
if ePowerState == PowerState.ON:
|
||||||
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
||||||
assert( bSuccess )
|
assert bSuccess
|
||||||
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
||||||
ePowerState = PowerState.SLEEP
|
ePowerState = PowerState.SLEEP
|
||||||
elif ePowerState == PowerState.SLEEP:
|
elif ePowerState == PowerState.SLEEP:
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||||
assert( bSuccess )
|
assert bSuccess
|
||||||
ePowerState = PowerState.ON
|
ePowerState = PowerState.ON
|
||||||
else:
|
else:
|
||||||
assert(False)
|
assert False
|
||||||
|
|
||||||
|
|
||||||
def Test0001():
|
def Test0001():
|
||||||
logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same tim ?')
|
logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same tim ?')
|
||||||
strTargetMachineName = 'simpatix12'
|
strTargetMachineName = 'simpatix12'
|
||||||
ePowerState = Util.getPowerState(strTargetMachineName)
|
ePowerState = Util.getPowerState(strTargetMachineName)
|
||||||
if ePowerState == PowerState.SLEEP:
|
if ePowerState == PowerState.SLEEP:
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||||
assert( bSuccess )
|
assert bSuccess
|
||||||
ePowerState = PowerState.ON
|
ePowerState = PowerState.ON
|
||||||
assert(ePowerState == PowerState.ON)
|
assert ePowerState == PowerState.ON
|
||||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName )
|
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||||
assert(bSuccess)
|
assert bSuccess
|
||||||
|
|
||||||
|
|
||||||
def Test0002():
|
def Test0002():
|
||||||
logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
|
logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
|
||||||
strTargetMachineName = 'simpatix12'
|
strTargetMachineName = 'simpatix12'
|
||||||
ePowerState = Util.getPowerState(strTargetMachineName)
|
ePowerState = Util.getPowerState(strTargetMachineName)
|
||||||
if ePowerState == PowerState.ON:
|
if ePowerState == PowerState.ON:
|
||||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||||
assert( bSuccess )
|
assert bSuccess
|
||||||
ePowerState = PowerState.SLEEP
|
ePowerState = PowerState.SLEEP
|
||||||
assert(ePowerState == PowerState.SLEEP)
|
assert ePowerState == PowerState.SLEEP
|
||||||
Util.executeIpmiCommand( strTargetMachineName, 'chassis power on' )
|
Util.executeIpmiCommand(strTargetMachineName, 'chassis power on')
|
||||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName )
|
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
Test0000()
|
Test0000()
|
||||||
|
|
|
@ -1,228 +1,234 @@
|
||||||
#import .Util
|
# import .Util
|
||||||
#import ..SimpaDbUtil
|
# import ..SimpaDbUtil
|
||||||
from .Log import *
|
from .Log import logDebug, logInfo, logWarning, logError
|
||||||
from .PowerState import *
|
from .PowerState import PowerState, PowerStateToStr
|
||||||
import re
|
import re
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
def executeProgram( astrArguments ):
|
|
||||||
bBUG_00000008_IS_STILL_ACTIVE = True
|
|
||||||
if bBUG_00000008_IS_STILL_ACTIVE:
|
|
||||||
logDebug('executeProgram : program = [%s]' % (','.join(astrArguments) ))
|
|
||||||
(returnCode, stdout, stderr) = Lib.Util.executeProgram( astrArguments )
|
|
||||||
if bBUG_00000008_IS_STILL_ACTIVE:
|
|
||||||
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
|
||||||
# for debugging purpose, log info in case the command failed
|
|
||||||
if returnCode != 0:
|
|
||||||
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
|
||||||
logDebug('executeCommand : stdout of [%s] = %s' % (','.join(astrArguments), stdout))
|
|
||||||
logDebug('executeCommand : stderr of [%s] = %s' % (','.join(astrArguments), stderr))
|
|
||||||
return (returnCode, stdout, stderr)
|
|
||||||
|
|
||||||
def executeCommand( command ):
|
def executeProgram(astrArguments):
|
||||||
#logDebug('executeCommand : command = ' + command)
|
bBUG_00000008_IS_STILL_ACTIVE = True
|
||||||
(returnCode, stdout, stderr) = Lib.Util.executeCommand( command )
|
if bBUG_00000008_IS_STILL_ACTIVE:
|
||||||
#logDebug('executeCommand : return code of "'+command+'" = '+str(returnCode))
|
logDebug('executeProgram : program = [%s]' % (','.join(astrArguments)))
|
||||||
return (returnCode, stdout, stderr)
|
(returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments)
|
||||||
|
if bBUG_00000008_IS_STILL_ACTIVE:
|
||||||
|
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
||||||
|
# for debugging purpose, log info in case the command failed
|
||||||
|
if returnCode != 0:
|
||||||
|
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
||||||
|
logDebug('executeCommand : stdout of [%s] = %s' % (','.join(astrArguments), stdout))
|
||||||
|
logDebug('executeCommand : stderr of [%s] = %s' % (','.join(astrArguments), stderr))
|
||||||
|
return (returnCode, stdout, stderr)
|
||||||
|
|
||||||
def executeIpmiCommand( machineName, ipmiCommandArgs ):
|
|
||||||
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress( machineName )
|
|
||||||
lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
|
|
||||||
astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
|
|
||||||
astrProgram.extend( ipmiCommandArgs )
|
|
||||||
#print 'executeIpmiCommand'
|
|
||||||
#print astrProgram
|
|
||||||
bBUG_00000005_IS_STILL_ACTIVE = True
|
|
||||||
if bBUG_00000005_IS_STILL_ACTIVE:
|
|
||||||
# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
|
|
||||||
bCommandSucceeded = False
|
|
||||||
while not bCommandSucceeded:
|
|
||||||
(returnCode, stdout, stderr) = executeProgram( astrProgram )
|
|
||||||
if returnCode == 0:
|
|
||||||
bCommandSucceeded = True
|
|
||||||
else:
|
|
||||||
logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
|
|
||||||
time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
|
|
||||||
else:
|
|
||||||
(returnCode, stdout, stderr) = executeProgram( astrProgram )
|
|
||||||
"""
|
|
||||||
sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
|
||||||
Unabled to establish a session with the BMC.
|
|
||||||
Command failed due to insufficient resources for session (0xFFFEF901)
|
|
||||||
-> this error means that the number of active conections to the BMC has reached the maximum (usually 5).
|
|
||||||
|
|
||||||
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
|
||||||
Unabled to establish a session with the BMC.
|
|
||||||
Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
|
|
||||||
|
|
||||||
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
def executeCommand(command):
|
||||||
Unabled to establish a session with the BMC.
|
# logDebug('executeCommand : command = ' + command)
|
||||||
Command failed due to Timeout (0xFFFEF9C3)
|
(returnCode, stdout, stderr) = Lib.Util.executeCommand(command)
|
||||||
"""
|
# logDebug('executeCommand : return code of "'+command+'" = '+str(returnCode))
|
||||||
|
return (returnCode, stdout, stderr)
|
||||||
return (returnCode, stdout, stderr)
|
|
||||||
|
|
||||||
|
def executeIpmiCommand(machineName, ipmiCommandArgs):
|
||||||
|
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName)
|
||||||
|
lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
|
||||||
|
astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
|
||||||
|
astrProgram.extend(ipmiCommandArgs)
|
||||||
|
# print 'executeIpmiCommand'
|
||||||
|
# print astrProgram
|
||||||
|
bBUG_00000005_IS_STILL_ACTIVE = True
|
||||||
|
if bBUG_00000005_IS_STILL_ACTIVE:
|
||||||
|
# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
|
||||||
|
bCommandSucceeded = False
|
||||||
|
while not bCommandSucceeded:
|
||||||
|
(returnCode, stdout, stderr) = executeProgram(astrProgram)
|
||||||
|
if returnCode == 0:
|
||||||
|
bCommandSucceeded = True
|
||||||
|
else:
|
||||||
|
logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
|
||||||
|
time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
|
||||||
|
else:
|
||||||
|
(returnCode, stdout, stderr) = executeProgram(astrProgram)
|
||||||
|
"""
|
||||||
|
sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
||||||
|
Unabled to establish a session with the BMC.
|
||||||
|
Command failed due to insufficient resources for session (0xFFFEF901)
|
||||||
|
-> this error means that the number of active conections to the BMC has reached the maximum (usually 5).
|
||||||
|
|
||||||
|
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
||||||
|
Unabled to establish a session with the BMC.
|
||||||
|
Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
|
||||||
|
|
||||||
|
sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
||||||
|
Unabled to establish a session with the BMC.
|
||||||
|
Command failed due to Timeout (0xFFFEF9C3)
|
||||||
|
"""
|
||||||
|
|
||||||
|
return (returnCode, stdout, stderr)
|
||||||
|
|
||||||
|
|
||||||
|
def getPowerState(machineName):
|
||||||
|
ePowerState = PowerState.UNKNOWN
|
||||||
|
bPowerStateRead = False
|
||||||
|
iNumFailedAttempts = 0
|
||||||
|
while not bPowerStateRead:
|
||||||
|
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State'])
|
||||||
|
if returnCode == 0:
|
||||||
|
matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
|
||||||
|
bBUG_00000002_IS_STILL_ACTIVE = True
|
||||||
|
if bBUG_00000002_IS_STILL_ACTIVE:
|
||||||
|
if matchObj is None:
|
||||||
|
# the following warning has been commented out because it pollutes the logs and apparently
|
||||||
|
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
|
||||||
|
# no power on event is logged ...
|
||||||
|
# logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
|
||||||
|
return PowerState.ON
|
||||||
|
else:
|
||||||
|
assert matchObj
|
||||||
|
strAcpiState = matchObj.group('AcpiState')
|
||||||
|
if strAcpiState == 'S0/G0':
|
||||||
|
ePowerState = PowerState.ON
|
||||||
|
elif strAcpiState == 'S3': # memory is still powered
|
||||||
|
ePowerState = PowerState.SLEEP
|
||||||
|
elif strAcpiState == 'S5/G2': # soft-off
|
||||||
|
ePowerState = PowerState.OFF
|
||||||
|
else:
|
||||||
|
print(strAcpiState)
|
||||||
|
assert False
|
||||||
|
bPowerStateRead = True
|
||||||
|
else:
|
||||||
|
# error ... it's either because the machine is unplugged or because the machine is busy (well I'm not sure what happened but I had the case where the command failed for no apparent reason, and therefore I suspect it to be busy). In order to differentiate these 2 cases, we try again and if this caommand fails too many times then we decide it's unplugged (very dodgy I know but I'm disapointed that this command doen't always work, and for now I don't know other ways to differentiate between these cases....)
|
||||||
|
iMAX_NUM_ATTEMPTS = 5
|
||||||
|
iNumFailedAttempts += 1
|
||||||
|
if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
|
||||||
|
logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
|
||||||
|
time.sleep(5)
|
||||||
|
else:
|
||||||
|
logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
|
||||||
|
ePowerState = PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged
|
||||||
|
bPowerStateRead = True
|
||||||
|
return ePowerState
|
||||||
|
|
||||||
|
|
||||||
|
def wakeUpMachine(machineName):
|
||||||
|
"""
|
||||||
|
this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
|
||||||
|
@return true on success, false otherwise
|
||||||
|
@note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
|
||||||
|
"""
|
||||||
|
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['chassis', 'power', 'on'])
|
||||||
|
bSuccess = (returnCode == 0) # this command can fail if the machine is manually unplugged for example
|
||||||
|
return bSuccess
|
||||||
|
|
||||||
|
|
||||||
|
def blockingPutMachineToSleep(machineName):
|
||||||
|
"""
|
||||||
|
@return true on success, false otherwise
|
||||||
|
"""
|
||||||
|
logInfo('putting machine %s to sleep...' % machineName)
|
||||||
|
iMaxNumAttempts = 5
|
||||||
|
bSuccess = False
|
||||||
|
bBUG_239_IS_STILL_ALIVE = True
|
||||||
|
iAttempt = 0
|
||||||
|
# note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
|
||||||
|
while not bSuccess:
|
||||||
|
# note : pmset must be executed as root
|
||||||
|
(returnCode, stdout, stderr) = executeProgram(['ssh', machineName, 'pmset sleepnow'])
|
||||||
|
# check if the machine actually went to sleep
|
||||||
|
iMaxGoToSleepDuration = 30 # in seconds
|
||||||
|
iDelay = 0
|
||||||
|
while iDelay < iMaxGoToSleepDuration:
|
||||||
|
time.sleep(5)
|
||||||
|
iDelay += 5
|
||||||
|
ePowerState = getPowerState(machineName)
|
||||||
|
if ePowerState == PowerState.SLEEP:
|
||||||
|
logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
if ePowerState != PowerState.ON:
|
||||||
|
logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
|
||||||
|
assert ePowerState == PowerState.ON
|
||||||
|
iAttempt += 1
|
||||||
|
if iAttempt > iMaxNumAttempts:
|
||||||
|
if bBUG_239_IS_STILL_ALIVE:
|
||||||
|
logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
|
||||||
|
return True
|
||||||
|
|
||||||
def getPowerState( machineName ):
|
|
||||||
ePowerState = PowerState.UNKNOWN
|
|
||||||
bPowerStateRead = False
|
|
||||||
iNumFailedAttempts = 0
|
|
||||||
while not bPowerStateRead:
|
|
||||||
(returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['sensor', 'get', 'ACPI State'] )
|
|
||||||
if returnCode == 0:
|
|
||||||
matchObj = re.search('\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
|
|
||||||
bBUG_00000002_IS_STILL_ACTIVE = True
|
|
||||||
if bBUG_00000002_IS_STILL_ACTIVE:
|
|
||||||
if matchObj == None:
|
|
||||||
# the following warning has been commented out because it pollutes the logs and apparently
|
|
||||||
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
|
|
||||||
# no power on event is logged ...
|
|
||||||
#logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
|
|
||||||
return PowerState.ON
|
|
||||||
else:
|
|
||||||
assert( matchObj )
|
|
||||||
strAcpiState = matchObj.group('AcpiState')
|
|
||||||
if strAcpiState == 'S0/G0':
|
|
||||||
ePowerState = PowerState.ON
|
|
||||||
elif strAcpiState == 'S3': # memory is still powered
|
|
||||||
ePowerState = PowerState.SLEEP
|
|
||||||
elif strAcpiState == 'S5/G2': # soft-off
|
|
||||||
ePowerState = PowerState.OFF
|
|
||||||
else:
|
|
||||||
print(strAcpiState)
|
|
||||||
assert( False )
|
|
||||||
bPowerStateRead = True
|
|
||||||
else:
|
|
||||||
# error ... it's either because the machine is unplugged or because the machine is busy (well I'm not sure what happened but I had the case where the command failed for no apparent reason, and therefore I suspect it to be busy ). In order to differentiate these 2 cases, we try again and if this caommand fails too many times then we decide it's unplugged (very dodgy I know but I'm disapointed that this command doen't always work, and for now I don't know other ways to differentiate between these cases....)
|
|
||||||
iMAX_NUM_ATTEMPTS=5
|
|
||||||
iNumFailedAttempts += 1
|
|
||||||
if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
|
|
||||||
logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
|
|
||||||
time.sleep(5)
|
|
||||||
else:
|
|
||||||
logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
|
|
||||||
ePowerState = PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged
|
|
||||||
bPowerStateRead = True
|
|
||||||
return ePowerState
|
|
||||||
|
|
||||||
def wakeUpMachine( machineName ):
|
|
||||||
"""
|
|
||||||
this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
|
|
||||||
@return true on success, false otherwise
|
|
||||||
@note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
|
|
||||||
"""
|
|
||||||
(returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['chassis', 'power', 'on'] )
|
|
||||||
bSuccess = (returnCode == 0) # this command can fail if the machine is manually unplugged for example
|
|
||||||
return bSuccess
|
|
||||||
|
|
||||||
def blockingPutMachineToSleep( machineName ):
|
|
||||||
"""
|
|
||||||
@return true on success, false otherwise
|
|
||||||
"""
|
|
||||||
logInfo('putting machine %s to sleep...' % machineName)
|
|
||||||
iMaxNumAttempts = 5
|
|
||||||
bSuccess = False
|
|
||||||
bBUG_239_IS_STILL_ALIVE = True
|
|
||||||
iAttempt = 0
|
|
||||||
# note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
|
|
||||||
while not bSuccess:
|
|
||||||
# note : pmset must be executed as root
|
|
||||||
(returnCode, stdout, stderr) = executeProgram(['ssh', machineName, 'pmset sleepnow'])
|
|
||||||
# check if the machine actually went to sleep
|
|
||||||
iMaxGoToSleepDuration = 30 # in seconds
|
|
||||||
iDelay = 0
|
|
||||||
while iDelay < iMaxGoToSleepDuration:
|
|
||||||
time.sleep(5)
|
|
||||||
iDelay += 5
|
|
||||||
ePowerState = getPowerState( machineName )
|
|
||||||
if ePowerState == PowerState.SLEEP:
|
|
||||||
logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
if ePowerState != PowerState.ON:
|
|
||||||
logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
|
|
||||||
assert(ePowerState == PowerState.ON)
|
|
||||||
iAttempt += 1
|
|
||||||
if iAttempt > iMaxNumAttempts:
|
|
||||||
if bBUG_239_IS_STILL_ALIVE:
|
|
||||||
logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
|
|
||||||
return True
|
|
||||||
|
|
||||||
def blockingWakeUpMachine(machineName):
|
def blockingWakeUpMachine(machineName):
|
||||||
logInfo('waking up machine %s...' % machineName)
|
logInfo('waking up machine %s...' % machineName)
|
||||||
numAttempts = 0
|
numAttempts = 0
|
||||||
bWakeUpFailed = True
|
bWakeUpFailed = True
|
||||||
while bWakeUpFailed: # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
|
while bWakeUpFailed: # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
|
||||||
iMaxNumWakeUpAttempts = 50
|
iMaxNumWakeUpAttempts = 50
|
||||||
iNumWakeUpAttempts = 0
|
iNumWakeUpAttempts = 0
|
||||||
bWakeUpMachineSucceeded = False
|
bWakeUpMachineSucceeded = False
|
||||||
while not bWakeUpMachineSucceeded:
|
while not bWakeUpMachineSucceeded:
|
||||||
bWakeUpMachineSucceeded = wakeUpMachine( machineName )
|
bWakeUpMachineSucceeded = wakeUpMachine(machineName)
|
||||||
iNumWakeUpAttempts += 1
|
iNumWakeUpAttempts += 1
|
||||||
# the previous command can fail if the machine is already in a transition
|
# the previous command can fail if the machine is already in a transition
|
||||||
# in that case we try sevral times bevire giving up
|
# in that case we try sevral times bevire giving up
|
||||||
if(bWakeUpMachineSucceeded == False):
|
if not bWakeUpMachineSucceeded:
|
||||||
if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
|
if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
|
||||||
iDelay = 5
|
iDelay = 5
|
||||||
logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
|
logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
|
||||||
time.sleep(iDelay)
|
time.sleep(iDelay)
|
||||||
else:
|
else:
|
||||||
logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
|
logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
|
||||||
return False # couldn't wake up to machine for whatever reason
|
return False # couldn't wake up to machine for whatever reason
|
||||||
|
|
||||||
bWakeUpFailed = False
|
bWakeUpFailed = False
|
||||||
# wait until the machine is operational
|
# wait until the machine is operational
|
||||||
WAKEUPTIMEOUT=5*60 # max number of seconds allowed for a machine to be alive after a wakeup request
|
WAKEUPTIMEOUT = 5 * 60 # max number of seconds allowed for a machine to be alive after a wakeup request
|
||||||
wakeUpToAliveDuration = 0
|
wakeUpToAliveDuration = 0
|
||||||
while not Lib.SimpaDbUtil.isMachineResponding( machineName ):
|
while not Lib.SimpaDbUtil.isMachineResponding(machineName):
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
wakeUpToAliveDuration+=5
|
wakeUpToAliveDuration += 5
|
||||||
if wakeUpToAliveDuration > WAKEUPTIMEOUT:
|
if wakeUpToAliveDuration > WAKEUPTIMEOUT:
|
||||||
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
|
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
|
||||||
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
|
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
|
||||||
bWakeUpFailed = True
|
bWakeUpFailed = True
|
||||||
break
|
break
|
||||||
if bWakeUpFailed:
|
if bWakeUpFailed:
|
||||||
numAttempts+=1
|
numAttempts += 1
|
||||||
if numAttempts >= 2:
|
if numAttempts >= 2:
|
||||||
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
|
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
|
||||||
return False # power state changed manually ?
|
return False # power state changed manually ?
|
||||||
else:
|
else:
|
||||||
logWarning('attempting to wake up %s one more time' % (machineName))
|
logWarning('attempting to wake up %s one more time' % (machineName))
|
||||||
else:
|
else:
|
||||||
# wake up completed
|
# wake up completed
|
||||||
logInfo('Waking up of machine %s completed successfully' % machineName)
|
logInfo('Waking up of machine %s completed successfully' % machineName)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def onException(exception):
|
def onException(exception):
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
strExceptionType = type( exception )
|
strExceptionType = type(exception)
|
||||||
strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
|
strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
|
||||||
#traceback.print_last()
|
# traceback.print_last()
|
||||||
f = io.StringIO()
|
f = io.StringIO()
|
||||||
traceback.print_exc(file=f)
|
traceback.print_exc(file=f)
|
||||||
strMessage += f.getvalue()
|
strMessage += f.getvalue()
|
||||||
f.close()
|
f.close()
|
||||||
logError(strMessage)
|
logError(strMessage)
|
||||||
print(strMessage)
|
print(strMessage)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
|
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
|
||||||
#by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that is this operation fails, then the
|
# by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that is this operation fails, then the
|
||||||
# kill of the main process is still executed.
|
# kill of the main process is still executed.
|
||||||
Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
|
Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
|
||||||
except BaseException:
|
except BaseException:
|
||||||
logError("Could not send the email to notify the administrator that cluster controller failed")
|
logError("Could not send the email to notify the administrator that cluster controller failed")
|
||||||
pass
|
pass
|
||||||
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
|
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,42 +1,47 @@
|
||||||
import Sensor
|
import Sensor
|
||||||
|
|
||||||
|
|
||||||
class ClusterNodeSensorsReadings:
|
class ClusterNodeSensorsReadings:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
POWERSTATE_UNKNOWN=0
|
POWERSTATE_UNKNOWN=0
|
||||||
POWERSTATE_OFF=1
|
POWERSTATE_OFF=1
|
||||||
POWERSTATE_ON=2
|
POWERSTATE_ON=2
|
||||||
POWERSTATE_SLEEP=3
|
POWERSTATE_SLEEP=3
|
||||||
"""
|
"""
|
||||||
def __init__(self, clusterNodeName):
|
def __init__(self, clusterNodeName):
|
||||||
self.m_clusterNodeName = clusterNodeName
|
self.m_clusterNodeName = clusterNodeName
|
||||||
self.m_sensors = {}
|
self.m_sensors = {}
|
||||||
#self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
|
# self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
|
||||||
return
|
return
|
||||||
def addSensor(self, sensor):
|
|
||||||
self.m_sensors[sensor.m_name] = sensor
|
def addSensor(self, sensor):
|
||||||
def dump(self):
|
self.m_sensors[sensor.m_name] = sensor
|
||||||
for key,sensor in self.m_sensors.items():
|
|
||||||
sensor.dump()
|
def dump(self):
|
||||||
return
|
for key, sensor in self.m_sensors.items():
|
||||||
#def getPowerState(self):
|
sensor.dump()
|
||||||
# return self.m_powerState
|
return
|
||||||
def getLowestTemperature( self ):
|
|
||||||
#log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
# def getPowerState(self):
|
||||||
lowestTemperature = 0.0
|
# return self.m_powerState
|
||||||
lowestTemperatureIsDefined = False
|
|
||||||
for key,sensor in self.m_sensors.items():
|
def getLowestTemperature(self):
|
||||||
#log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||||
if sensor.typeName() == 'Temperature':
|
lowestTemperature = 0.0
|
||||||
sensor.m_temperature
|
lowestTemperatureIsDefined = False
|
||||||
if lowestTemperatureIsDefined:
|
for key, sensor in self.m_sensors.items():
|
||||||
if sensor.m_temperature < lowestTemperature:
|
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||||
lowestTemperature = sensor.m_temperature
|
if sensor.typeName() == 'Temperature':
|
||||||
else:
|
sensor.m_temperature
|
||||||
lowestTemperature = sensor.m_temperature
|
if lowestTemperatureIsDefined:
|
||||||
lowestTemperatureIsDefined = True
|
if sensor.m_temperature < lowestTemperature:
|
||||||
assert( lowestTemperatureIsDefined )
|
lowestTemperature = sensor.m_temperature
|
||||||
#log('ClusterNodeSensorsReadings::getLowestTemperature : end')
|
else:
|
||||||
return lowestTemperature
|
lowestTemperature = sensor.m_temperature
|
||||||
|
lowestTemperatureIsDefined = True
|
||||||
|
assert lowestTemperatureIsDefined
|
||||||
|
# log('ClusterNodeSensorsReadings::getLowestTemperature : end')
|
||||||
|
return lowestTemperature
|
||||||
|
|
|
@ -3,79 +3,81 @@ import re
|
||||||
from Sensor import FanSensor, TemperatureSensor
|
from Sensor import FanSensor, TemperatureSensor
|
||||||
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
|
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
|
||||||
|
|
||||||
|
|
||||||
class IpmiTool202Parser:
|
class IpmiTool202Parser:
|
||||||
def parseSensorOutput( self, strOutput, clusterNodeName ):
|
def parseSensorOutput(self, strOutput, clusterNodeName):
|
||||||
sensorReadings=ClusterNodeSensorsReadings(clusterNodeName)
|
sensorReadings = ClusterNodeSensorsReadings(clusterNodeName)
|
||||||
f = io.StringIO(strOutput)
|
f = io.StringIO(strOutput)
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
while( len(line) > 0 ):
|
while len(line) > 0:
|
||||||
#print line,
|
# print line,
|
||||||
matchObj = re.match( '^Sensor ID[ ]*\: \'(?P<sensorName>[a-zA-Z 0-9]+)\'', line )
|
matchObj = re.match(r'^Sensor ID[ ]*\: \'(?P<sensorName>[a-zA-Z 0-9]+)\'', line)
|
||||||
if matchObj:
|
if matchObj:
|
||||||
sensorName = matchObj.group('sensorName')
|
sensorName = matchObj.group('sensorName')
|
||||||
# print sensorName
|
# print sensorName
|
||||||
# read the entity id
|
# read the entity id
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
matchObj = re.match( '^ Entity ID[ ]*\: (?P<entityId>[0-9\.]+)', line )
|
matchObj = re.match(r'^ Entity ID[ ]*\: (?P<entityId>[0-9\.]+)', line)
|
||||||
assert(matchObj)
|
assert matchObj
|
||||||
entityId = matchObj.group('entityId')
|
entityId = matchObj.group('entityId')
|
||||||
# print entityId
|
# print entityId
|
||||||
# read the sensor type
|
# read the sensor type
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
matchObj = re.match( '^ Sensor Type[\(\)a-zA-Z ]*\: (?P<sensorType>[a-zA-Z \(\)]+)', line )
|
matchObj = re.match(r'^ Sensor Type[\(\)a-zA-Z ]*\: (?P<sensorType>[a-zA-Z \(\)]+)', line)
|
||||||
assert(matchObj)
|
assert matchObj
|
||||||
sensorType = matchObj.group('sensorType')
|
sensorType = matchObj.group('sensorType')
|
||||||
#print sensorType
|
# print sensorType
|
||||||
if sensorType == 'Fan':
|
if sensorType == 'Fan':
|
||||||
rpms = self.parseFanSensorOutput(f)
|
rpms = self.parseFanSensorOutput(f)
|
||||||
if temperature != None:
|
if temperature is not None:
|
||||||
sensor = FanSensor(sensorName)
|
sensor = FanSensor(sensorName)
|
||||||
sensor.m_rpms = rpms
|
sensor.m_rpms = rpms
|
||||||
elif sensorType == 'Temperature':
|
elif sensorType == 'Temperature':
|
||||||
temperature = self.parseTemperatureSensorOutput(f)
|
temperature = self.parseTemperatureSensorOutput(f)
|
||||||
if temperature != None:
|
if temperature is not None:
|
||||||
sensor = TemperatureSensor(sensorName)
|
sensor = TemperatureSensor(sensorName)
|
||||||
sensor.m_temperature = temperature
|
sensor.m_temperature = temperature
|
||||||
else:
|
else:
|
||||||
#ignoring other sensors
|
# ignoring other sensors
|
||||||
sensor = None
|
sensor = None
|
||||||
if sensor:
|
if sensor:
|
||||||
sensorReadings.addSensor( sensor )
|
sensorReadings.addSensor(sensor)
|
||||||
else:
|
else:
|
||||||
None
|
None
|
||||||
#assert(False)
|
# assert(False)
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
f.close()
|
f.close()
|
||||||
def parseFanSensorOutput(self, file):
|
|
||||||
"""
|
def parseFanSensorOutput(self, file):
|
||||||
reads the fan specific ipdmitool output
|
"""
|
||||||
"""
|
reads the fan specific ipdmitool output
|
||||||
line = file.readline()
|
"""
|
||||||
#print line
|
line = file.readline()
|
||||||
matchObj = re.match( '^ Sensor Reading[ ]*\: (?P<numRpms>[0-9]+) \(\+/\- (?P<rpmsPrecision>[0-9]+)\) RPM', line )
|
# print line
|
||||||
if(matchObj):
|
matchObj = re.match(r'^ Sensor Reading[ ]*\: (?P<numRpms>[0-9]+) \(\+/\- (?P<rpmsPrecision>[0-9]+)\) RPM', line)
|
||||||
numRpms = matchObj.group('numRpms')
|
if matchObj:
|
||||||
#print numRpms
|
numRpms = matchObj.group('numRpms')
|
||||||
rpms = float( numRpms )
|
# print numRpms
|
||||||
return rpms
|
rpms = float(numRpms)
|
||||||
else:
|
return rpms
|
||||||
matchObj = re.match( '^ Sensor Reading[ ]*\: Not Present', line )
|
else:
|
||||||
assert(matchObj)
|
matchObj = re.match(r'^ Sensor Reading[ ]*\: Not Present', line)
|
||||||
return None
|
assert matchObj
|
||||||
|
return None
|
||||||
def parseTemperatureSensorOutput(self, file):
|
|
||||||
"""
|
def parseTemperatureSensorOutput(self, file):
|
||||||
reads the temperature specific ipdmitool output
|
"""
|
||||||
"""
|
reads the temperature specific ipdmitool output
|
||||||
# Sensor Reading : 36 (+/- 0) degrees C
|
"""
|
||||||
line = file.readline()
|
# Sensor Reading : 36 (+/- 0) degrees C
|
||||||
#print line
|
line = file.readline()
|
||||||
matchObj = re.match( '^ Sensor Reading[ ]*\: (?P<temperature>[0-9]+) \(\+/\- (?P<precision>[0-9]+)\) degrees C', line )
|
# print line
|
||||||
if(matchObj):
|
matchObj = re.match(r'^ Sensor Reading[ ]*\: (?P<temperature>[0-9]+) \(\+/\- (?P<precision>[0-9]+)\) degrees C', line)
|
||||||
temperature = matchObj.group('temperature')
|
if matchObj:
|
||||||
temperature = float( temperature )
|
temperature = matchObj.group('temperature')
|
||||||
return temperature
|
temperature = float(temperature)
|
||||||
else:
|
return temperature
|
||||||
matchObj = re.match( '^ Sensor Reading[ ]*\: Not Present', line )
|
else:
|
||||||
assert(matchObj)
|
matchObj = re.match(r'^ Sensor Reading[ ]*\: Not Present', line)
|
||||||
return None
|
assert matchObj
|
||||||
|
return None
|
||||||
|
|
|
@ -3,37 +3,37 @@ import re
|
||||||
from Sensor import FanSensor, TemperatureSensor
|
from Sensor import FanSensor, TemperatureSensor
|
||||||
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
|
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
|
||||||
|
|
||||||
|
|
||||||
class IpmiTool218Parser:
|
class IpmiTool218Parser:
|
||||||
def parseSensorOutput( self, strOutput, clusterNodeName ):
|
def parseSensorOutput(self, strOutput, clusterNodeName):
|
||||||
sensorReadings=ClusterNodeSensorsReadings(clusterNodeName)
|
sensorReadings = ClusterNodeSensorsReadings(clusterNodeName)
|
||||||
f = io.StringIO(strOutput)
|
f = io.StringIO(strOutput)
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
while( len(line) > 0 ):
|
while len(line) > 0:
|
||||||
#print line,
|
# print line,
|
||||||
matchObj = re.match( '^(?P<sensorName>[a-zA-Z 0-9]+[a-zA-Z 0-9]*[a-zA-Z0-9])[ ]*\| (?P<sensorValue>[\.0-9]+)[ ]*\| (?P<sensorUnit>[a-zA-Z0-9][a-zA-Z 0-9]*[a-zA-Z0-9])[?]*', line )
|
matchObj = re.match(r'^(?P<sensorName>[a-zA-Z 0-9]+[a-zA-Z 0-9]*[a-zA-Z0-9])[ ]*\| (?P<sensorValue>[\.0-9]+)[ ]*\| (?P<sensorUnit>[a-zA-Z0-9][a-zA-Z 0-9]*[a-zA-Z0-9])[?]*', line)
|
||||||
if matchObj:
|
if matchObj:
|
||||||
#log('readClusterNodeSensorsIpmiTool2_1_8 : sensorName = '+matchObj.group('sensorName'))
|
# log('readClusterNodeSensorsIpmiTool2_1_8 : sensorName = '+matchObj.group('sensorName'))
|
||||||
#log('readClusterNodeSensorsIpmiTool2_1_8 : sensorValue = '+matchObj.group('sensorValue'))
|
# log('readClusterNodeSensorsIpmiTool2_1_8 : sensorValue = '+matchObj.group('sensorValue'))
|
||||||
#log('readClusterNodeSensorsIpmiTool2_1_8 : sensorUnit = "'+matchObj.group('sensorUnit')+'"')
|
# log('readClusterNodeSensorsIpmiTool2_1_8 : sensorUnit = "'+matchObj.group('sensorUnit')+'"')
|
||||||
sensorName = matchObj.group('sensorName')
|
sensorName = matchObj.group('sensorName')
|
||||||
sensorValue = matchObj.group('sensorValue')
|
sensorValue = matchObj.group('sensorValue')
|
||||||
sensorUnit = matchObj.group('sensorUnit')
|
sensorUnit = matchObj.group('sensorUnit')
|
||||||
sensor = None
|
sensor = None
|
||||||
if sensorUnit == 'degrees C':
|
if sensorUnit == 'degrees C':
|
||||||
sensor = TemperatureSensor(sensorName)
|
sensor = TemperatureSensor(sensorName)
|
||||||
sensor.m_temperature = float( sensorValue )
|
sensor.m_temperature = float(sensorValue)
|
||||||
elif sensorUnit == 'RPM':
|
elif sensorUnit == 'RPM':
|
||||||
sensor = FanSensor(sensorName)
|
sensor = FanSensor(sensorName)
|
||||||
sensor.m_rpms = float( sensorValue )
|
sensor.m_rpms = float(sensorValue)
|
||||||
else:
|
else:
|
||||||
None
|
None
|
||||||
if sensor:
|
if sensor:
|
||||||
#log('readClusterNodeSensorsIpmiTool2_1_8 : adding sensor')
|
# log('readClusterNodeSensorsIpmiTool2_1_8 : adding sensor')
|
||||||
sensorReadings.addSensor( sensor )
|
sensorReadings.addSensor(sensor)
|
||||||
else:
|
else:
|
||||||
None
|
None
|
||||||
#assert(False)
|
# assert(False)
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
f.close()
|
f.close()
|
||||||
return sensorReadings
|
return sensorReadings
|
||||||
|
|
||||||
|
|
|
@ -1,23 +1,23 @@
|
||||||
class Sensor:
|
class Sensor:
|
||||||
def __init__(self, sensorName):
|
def __init__(self, sensorName):
|
||||||
self.m_name = sensorName
|
self.m_name = sensorName
|
||||||
self.m_isValid = True # false if this sensor is not actually present on the target machine
|
self.m_isValid = True # false if this sensor is not actually present on the target machine
|
||||||
return
|
return
|
||||||
def dump(self):
|
def dump(self):
|
||||||
print self.m_name
|
print self.m_name
|
||||||
|
|
||||||
class FanSensor(Sensor):
|
class FanSensor(Sensor):
|
||||||
def __init__(self, sensorName):
|
def __init__(self, sensorName):
|
||||||
Sensor.__init__(self, sensorName)
|
Sensor.__init__(self, sensorName)
|
||||||
def dump(self):
|
def dump(self):
|
||||||
print 'Fan \'', self.m_name, '\' rpm=',self.m_rpms
|
print 'Fan \'', self.m_name, '\' rpm=',self.m_rpms
|
||||||
def typeName(self):
|
def typeName(self):
|
||||||
return 'Fan'
|
return 'Fan'
|
||||||
|
|
||||||
class TemperatureSensor(Sensor):
|
class TemperatureSensor(Sensor):
|
||||||
def __init__(self, sensorName):
|
def __init__(self, sensorName):
|
||||||
Sensor.__init__(self, sensorName)
|
Sensor.__init__(self, sensorName)
|
||||||
def dump(self):
|
def dump(self):
|
||||||
print 'Temperature \'', self.m_name, '\' temperature=',self.m_temperature
|
print 'Temperature \'', self.m_name, '\' temperature=',self.m_temperature
|
||||||
def typeName(self):
|
def typeName(self):
|
||||||
return 'Temperature'
|
return 'Temperature'
|
||||||
|
|
|
@ -6,9 +6,9 @@ if sys.version_info < (3, 0):
|
||||||
else:
|
else:
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
import re
|
import re
|
||||||
from .wol import *
|
from .wol import wake_on_lan
|
||||||
import os
|
import os
|
||||||
from .Util import *
|
from .Util import executeProgram, executeCommand, log
|
||||||
import abc
|
import abc
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from .mysql2sqlite import mysql_to_sqlite
|
from .mysql2sqlite import mysql_to_sqlite
|
||||||
|
@ -17,7 +17,7 @@ from .mysql2sqlite import mysql_to_sqlite
|
||||||
def isMachineResponding(machineName):
|
def isMachineResponding(machineName):
|
||||||
(returnCode, stdout, stderr) = executeProgram(['ping', '-o', '-t', '1', machineName])
|
(returnCode, stdout, stderr) = executeProgram(['ping', '-o', '-t', '1', machineName])
|
||||||
# log( 'isMachineResponding : result of command %s : %d' % (command, returnCode) )
|
# log( 'isMachineResponding : result of command %s : %d' % (command, returnCode) )
|
||||||
|
|
||||||
if returnCode == 0:
|
if returnCode == 0:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
|
@ -33,7 +33,7 @@ def isMachineResponding(machineName):
|
||||||
# don't stop the program until we understand bug00000004
|
# don't stop the program until we understand bug00000004
|
||||||
else:
|
else:
|
||||||
log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
|
||||||
assert(False)
|
assert False
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
@ -60,10 +60,10 @@ class RemoteMysqlDb(ISqlDatabaseBackend):
|
||||||
self._db_user = db_user
|
self._db_user = db_user
|
||||||
self._db_name = db_name
|
self._db_name = db_name
|
||||||
self._connect()
|
self._connect()
|
||||||
|
|
||||||
def _connect(self):
|
def _connect(self):
|
||||||
self._conn = MySQLdb.connect(self._db_server_fqdn, self._db_user, '', self._db_name)
|
self._conn = MySQLdb.connect(self._db_server_fqdn, self._db_user, '', self._db_name)
|
||||||
assert(self._conn)
|
assert self._conn
|
||||||
|
|
||||||
def query(self, sql_query):
|
def query(self, sql_query):
|
||||||
"""
|
"""
|
||||||
|
@ -73,7 +73,7 @@ class RemoteMysqlDb(ISqlDatabaseBackend):
|
||||||
rows = conn.store_result()
|
rows = conn.store_result()
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
|
||||||
class SqlFile(ISqlDatabaseBackend):
|
class SqlFile(ISqlDatabaseBackend):
|
||||||
def __init__(self, sql_file_path, truncate_hex_strings=False):
|
def __init__(self, sql_file_path, truncate_hex_strings=False):
|
||||||
"""
|
"""
|
||||||
|
@ -128,7 +128,7 @@ class TableAttrNotFound(Exception):
|
||||||
|
|
||||||
|
|
||||||
class SqlDatabaseReader(object):
|
class SqlDatabaseReader(object):
|
||||||
|
|
||||||
def __init__(self, inv_provider):
|
def __init__(self, inv_provider):
|
||||||
"""
|
"""
|
||||||
:param ISqlDatabaseBackend inv_provider: the input that provides the inventory data
|
:param ISqlDatabaseBackend inv_provider: the input that provides the inventory data
|
||||||
|
@ -138,7 +138,7 @@ class SqlDatabaseReader(object):
|
||||||
def query(self, sql_query):
|
def query(self, sql_query):
|
||||||
"""
|
"""
|
||||||
performs a query on the sql database
|
performs a query on the sql database
|
||||||
|
|
||||||
:param str sql_query: the sql query to perform
|
:param str sql_query: the sql query to perform
|
||||||
"""
|
"""
|
||||||
return self._inv_provider.query(sql_query)
|
return self._inv_provider.query(sql_query)
|
||||||
|
@ -146,7 +146,7 @@ class SqlDatabaseReader(object):
|
||||||
def get_table_attr(self, table, key_name, key_value, attr_name):
|
def get_table_attr(self, table, key_name, key_value, attr_name):
|
||||||
"""
|
"""
|
||||||
reads the value of the given attribute of the given item in the given table
|
reads the value of the given attribute of the given item in the given table
|
||||||
|
|
||||||
:param str table: the name of the table to read
|
:param str table: the name of the table to read
|
||||||
:param str key_name: the name of the column that stores the id of the item to read
|
:param str key_name: the name of the column that stores the id of the item to read
|
||||||
:param str key_value: the id of the item to read
|
:param str key_value: the id of the item to read
|
||||||
|
@ -163,13 +163,13 @@ class SqlDatabaseReader(object):
|
||||||
|
|
||||||
def machineNameToMacAddress(machineName):
|
def machineNameToMacAddress(machineName):
|
||||||
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
||||||
assert(conn)
|
assert conn
|
||||||
sqlQuery = """SELECT mac_address FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='normal'"""
|
sqlQuery = """SELECT mac_address FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='normal'"""
|
||||||
# print sqlQuery
|
# print sqlQuery
|
||||||
conn.query(sqlQuery)
|
conn.query(sqlQuery)
|
||||||
r = conn.store_result()
|
r = conn.store_result()
|
||||||
row = r.fetch_row(0)
|
row = r.fetch_row(0)
|
||||||
assert( len(row) == 1)
|
assert len(row) == 1
|
||||||
# print 'row =', row
|
# print 'row =', row
|
||||||
macAddress = row[0][0]
|
macAddress = row[0][0]
|
||||||
# print macAddress
|
# print macAddress
|
||||||
|
@ -182,13 +182,13 @@ def getLightOutManagementIpAddress(machineName):
|
||||||
the light out management ip of servers allows to talk to the server even when it's asleep
|
the light out management ip of servers allows to talk to the server even when it's asleep
|
||||||
"""
|
"""
|
||||||
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
||||||
assert(conn)
|
assert conn
|
||||||
sqlQuery = """SELECT ip_address_1,ip_address_2,ip_address_3,ip_address_4 FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='light_out_management'"""
|
sqlQuery = """SELECT ip_address_1,ip_address_2,ip_address_3,ip_address_4 FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='light_out_management'"""
|
||||||
# print sqlQuery
|
# print sqlQuery
|
||||||
conn.query(sqlQuery)
|
conn.query(sqlQuery)
|
||||||
r = conn.store_result()
|
r = conn.store_result()
|
||||||
row = r.fetch_row(0)
|
row = r.fetch_row(0)
|
||||||
assert(len(row) == 1)
|
assert len(row) == 1
|
||||||
# print 'row =', row
|
# print 'row =', row
|
||||||
ipAddress = ('%s.%s.%s.%s') % (row[0][0], row[0][1], row[0][2], row[0][3])
|
ipAddress = ('%s.%s.%s.%s') % (row[0][0], row[0][1], row[0][2], row[0][3])
|
||||||
# print macAddress
|
# print macAddress
|
||||||
|
@ -199,7 +199,7 @@ def getLightOutManagementIpAddress(machineName):
|
||||||
def getClusterMachinesNames():
|
def getClusterMachinesNames():
|
||||||
clusterMachinesNames = []
|
clusterMachinesNames = []
|
||||||
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
||||||
assert(conn)
|
assert conn
|
||||||
sqlQuery = """SELECT name FROM machines WHERE affectation='cluster'"""
|
sqlQuery = """SELECT name FROM machines WHERE affectation='cluster'"""
|
||||||
# print sqlQuery
|
# print sqlQuery
|
||||||
conn.query(sqlQuery)
|
conn.query(sqlQuery)
|
||||||
|
@ -231,7 +231,7 @@ def putToSleep(machineName):
|
||||||
print 'stderr :'
|
print 'stderr :'
|
||||||
print stderr
|
print stderr
|
||||||
"""
|
"""
|
||||||
assert(returnCode == 0)
|
assert returnCode == 0
|
||||||
# check if the command succeeded by looking at the output (that's the only way I found)
|
# check if the command succeeded by looking at the output (that's the only way I found)
|
||||||
f = StringIO.StringIO(stdout)
|
f = StringIO.StringIO(stdout)
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
|
@ -248,7 +248,7 @@ def wakeUp(machineName):
|
||||||
wake_on_lan(macAddress)
|
wake_on_lan(macAddress)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def isNonRespondingMachineSleeping(machineName):
|
def isNonRespondingMachineSleeping(machineName):
|
||||||
"""
|
"""
|
||||||
note : crappy method to detect if the machine is sleeping (if other methods are available, I would be very interested)
|
note : crappy method to detect if the machine is sleeping (if other methods are available, I would be very interested)
|
||||||
|
|
|
@ -1,61 +1,67 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
|
|
||||||
#import sys
|
# import sys
|
||||||
#sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python')
|
# sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python')
|
||||||
import re
|
import re
|
||||||
#import Lib.Util
|
# import Lib.Util
|
||||||
|
|
||||||
|
|
||||||
class SgeConfig:
|
class SgeConfig:
|
||||||
def __init__( self ):
|
|
||||||
self.m_attrs={}
|
def __init__(self):
|
||||||
def hasAttr(self, attr_name):
|
self.m_attrs = {}
|
||||||
return attr_name in self.m_attrs.keys()
|
|
||||||
def getAttr( self, strAttrName ):
|
def hasAttr(self, attr_name):
|
||||||
return self.m_attrs[ strAttrName ]
|
return attr_name in self.m_attrs.keys()
|
||||||
def setAttr( self, strAttrName, strAttrValue ):
|
|
||||||
assert isinstance(strAttrName, str)
|
def getAttr(self, strAttrName):
|
||||||
assert isinstance(strAttrValue, str)
|
return self.m_attrs[strAttrName]
|
||||||
self.m_attrs[ strAttrName ] = strAttrValue
|
|
||||||
def loadFromSgeFormat1String( self, strSgeConfigString ):
|
def setAttr(self, strAttrName, strAttrValue):
|
||||||
"""
|
assert isinstance(strAttrName, str)
|
||||||
loads attrs from a string such as :
|
assert isinstance(strAttrValue, str)
|
||||||
hostname simpatix11.univ-rennes1.fr
|
self.m_attrs[strAttrName] = strAttrValue
|
||||||
load_scaling NONE
|
|
||||||
complex_values has_molpro_2010=0
|
def loadFromSgeFormat1String(self, strSgeConfigString):
|
||||||
load_values arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
|
"""
|
||||||
swap_total=0.000000M,virtual_total=8192.000000M, \
|
loads attrs from a string such as :
|
||||||
load_avg=5.126465,load_short=5.186523, \
|
hostname simpatix11.univ-rennes1.fr
|
||||||
load_medium=5.126465,load_long=5.087891, \
|
load_scaling NONE
|
||||||
mem_free=6654.054688M,swap_free=0.000000M, \
|
complex_values has_molpro_2010=0
|
||||||
virtual_free=6654.054688M,mem_used=1537.945312M, \
|
load_values arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
|
||||||
swap_used=0.000000M,virtual_used=1537.945312M, \
|
swap_total=0.000000M,virtual_total=8192.000000M, \
|
||||||
cpu=100.000000,m_topology=NONE,m_topology_inuse=NONE, \
|
load_avg=5.126465,load_short=5.186523, \
|
||||||
m_socket=0,m_core=0,np_load_avg=1.281616, \
|
load_medium=5.126465,load_long=5.087891, \
|
||||||
np_load_short=1.296631,np_load_medium=1.281616, \
|
mem_free=6654.054688M,swap_free=0.000000M, \
|
||||||
np_load_long=1.271973
|
virtual_free=6654.054688M,mem_used=1537.945312M, \
|
||||||
processors 4
|
swap_used=0.000000M,virtual_used=1537.945312M, \
|
||||||
user_lists NONE
|
cpu=100.000000,m_topology=NONE,m_topology_inuse=NONE, \
|
||||||
xuser_lists NONE
|
m_socket=0,m_core=0,np_load_avg=1.281616, \
|
||||||
projects NONE
|
np_load_short=1.296631,np_load_medium=1.281616, \
|
||||||
xprojects NONE
|
np_load_long=1.271973
|
||||||
usage_scaling NONE
|
processors 4
|
||||||
report_variables NONE
|
user_lists NONE
|
||||||
"""
|
xuser_lists NONE
|
||||||
self.m_attrs={}
|
projects NONE
|
||||||
# put multiline attributes on one line
|
xprojects NONE
|
||||||
strSgeConfigString = re.sub(r"\\\n", "", strSgeConfigString)
|
usage_scaling NONE
|
||||||
for strAttrDef in strSgeConfigString.split("\n"):
|
report_variables NONE
|
||||||
# print("strAttrDef=%s" % strAttrDef)
|
"""
|
||||||
if len(strAttrDef) != 0:
|
self.m_attrs = {}
|
||||||
matchObj = re.match( "^(?P<attrName>[^\s]+)[ ]+(?P<attrValue>[^\s].*)$", strAttrDef )
|
# put multiline attributes on one line
|
||||||
assert matchObj is not None
|
strSgeConfigString = re.sub(r"\\\n", "", strSgeConfigString)
|
||||||
#print( '%s = %s\n' % (matchObj.group("attrName"), matchObj.group("attrValue") ) )
|
for strAttrDef in strSgeConfigString.split("\n"):
|
||||||
self.m_attrs[ matchObj.group("attrName") ] = matchObj.group("attrValue")
|
# print("strAttrDef=%s" % strAttrDef)
|
||||||
|
if len(strAttrDef) != 0:
|
||||||
def loadFromSgeFormat2String( self, strSgeConfigString ):
|
# an attribute definition is "<name><one or more spaces><value>"; the separator class must hold a literal space ('[ ]+'): an empty '[]' makes ']' the first member of the class, the pattern then fails to compile and re.match() raises re.error
matchObj = re.match(r"^(?P<attrName>[^\s]+)[ ]+(?P<attrValue>[^\s].*)$", strAttrDef)
|
||||||
"""
|
assert matchObj is not None
|
||||||
loads attrs from a string such as :
|
# print('%s = %s\n' % (matchObj.group("attrName"), matchObj.group("attrValue")))
|
||||||
arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
|
self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")
|
||||||
|
|
||||||
|
def loadFromSgeFormat2String(self, strSgeConfigString):
|
||||||
|
"""
|
||||||
|
loads attrs from a string such as :
|
||||||
|
arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
|
||||||
swap_total=0.000000M,virtual_total=8192.000000M, \
|
swap_total=0.000000M,virtual_total=8192.000000M, \
|
||||||
load_avg=5.126465,load_short=5.186523, \
|
load_avg=5.126465,load_short=5.186523, \
|
||||||
load_medium=5.126465,load_long=5.087891, \
|
load_medium=5.126465,load_long=5.087891, \
|
||||||
|
@ -66,87 +72,88 @@ class SgeConfig:
|
||||||
m_socket=0,m_core=0,np_load_avg=1.281616, \
|
m_socket=0,m_core=0,np_load_avg=1.281616, \
|
||||||
np_load_short=1.296631,np_load_medium=1.281616, \
|
np_load_short=1.296631,np_load_medium=1.281616, \
|
||||||
np_load_long=1.271973
|
np_load_long=1.271973
|
||||||
"""
|
"""
|
||||||
self.m_attrs={}
|
self.m_attrs = {}
|
||||||
if strSgeConfigString != "NONE":
|
if strSgeConfigString != "NONE":
|
||||||
for strAttrDef in strSgeConfigString.split(","):
|
for strAttrDef in strSgeConfigString.split(","):
|
||||||
#print strAttrDef
|
# print strAttrDef
|
||||||
if len(strAttrDef) != 0:
|
if len(strAttrDef) != 0:
|
||||||
matchObj = re.match( "^\s*(?P<attrName>[^=]+)=(?P<attrValue>.*)$", strAttrDef )
|
matchObj = re.match(r"^\s*(?P<attrName>[^=]+)=(?P<attrValue>.*)$", strAttrDef)
|
||||||
#print matchObj.group("attrName")
|
# print matchObj.group("attrName")
|
||||||
self.m_attrs[ matchObj.group("attrName") ] = matchObj.group("attrValue")
|
self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")
|
||||||
def asFormat1String( self ):
|
|
||||||
strResult = ""
|
|
||||||
for (k,v) in self.m_attrs.items():
|
|
||||||
#print "%s %s" % (k,v)
|
|
||||||
# if the attribute's value is a list of comma separated strings, make sure there are no spaces after the commas, otherwise the value is not properly interpreted when read back into sge
|
|
||||||
# for example if the user sets the value of administrator_mail (using qconf -mconf global) to "alice@univ-rennes1.fr, bob@univ-rennes1.fr", then the next call to qconf -sconf global will show a wrong value for administrator_mail, as shown below:
|
|
||||||
# pag_cmd none
|
|
||||||
# administrator_mail alice@univ-rennes1.fr,
|
|
||||||
# token_extend_time none
|
|
||||||
|
|
||||||
# it's even worse, as it messes with the whole config, putting unwanted attributes in the reporting_params attribute. In short, inputting commas followed by spaces seems to confuse sge....
|
def asFormat1String(self):
|
||||||
|
strResult = ""
|
||||||
|
for (k, v) in self.m_attrs.items():
|
||||||
|
# print "%s %s" % (k,v)
|
||||||
|
# if the attribute's value is a list of comma separated strings, make sure there are no spaces after the commas, otherwise the value is not properly interpreted when read back into sge
|
||||||
|
# for example if the user sets the value of administrator_mail (using qconf -mconf global) to "alice@univ-rennes1.fr, bob@univ-rennes1.fr", then the next call to qconf -sconf global will show a wrong value for administrator_mail, as shown below:
|
||||||
|
# pag_cmd none
|
||||||
|
# administrator_mail alice@univ-rennes1.fr,
|
||||||
|
# token_extend_time none
|
||||||
|
|
||||||
# the tests below show that administrator_mail can only take a value, which can be a separator separated list, in which a separator is either :
|
# it's even worse, as it messes with the whole config, putting unwanted attributes in the reporting_params attribute. In short, inputting commas followed by spaces seems to confuse sge....
|
||||||
# - separator_form_a: a comma character (no spaces after)
|
|
||||||
# - separator_form_b: a comma character, followed by any number of spaces, then a backslash, then \n
|
|
||||||
|
|
||||||
# because we remove carriage returns in our values, the only storage option is separator_form_a
|
# the tests below show that administrator_mail can only take a value, which can be a separator separated list, in which a separator is either :
|
||||||
|
# - separator_form_a: a comma character (no spaces after)
|
||||||
|
# - separator_form_b: a comma character, followed by any number of spaces, then a backslash, then \n
|
||||||
|
|
||||||
# administrator_mail alice@univ-rennes1.fr
|
# because we remove carriage returns in our values, the only storage option is separator_form_a
|
||||||
# -> ok
|
|
||||||
|
|
||||||
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr
|
# administrator_mail alice@univ-rennes1.fr
|
||||||
# -> ok
|
# -> ok
|
||||||
|
|
||||||
# administrator_mail alice@univ-rennes1.fr, bob@univ-rennes1.fr
|
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr
|
||||||
# -> messes up
|
# -> ok
|
||||||
|
|
||||||
# administrator_mail alice@univ-rennes1.fr, \
|
# administrator_mail alice@univ-rennes1.fr, bob@univ-rennes1.fr
|
||||||
# bob@univ-rennes1.fr
|
# -> messes up
|
||||||
# -> ok
|
|
||||||
|
|
||||||
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
|
# administrator_mail alice@univ-rennes1.fr, \
|
||||||
# bob2@univ-rennes1.fr
|
# bob@univ-rennes1.fr
|
||||||
# -> ok
|
# -> ok
|
||||||
|
|
||||||
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
|
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
|
||||||
# bob2@univ-rennes1.fr
|
# bob2@univ-rennes1.fr
|
||||||
# -> ok
|
# -> ok
|
||||||
|
|
||||||
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr \
|
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
|
||||||
# bob2@univ-rennes1.fr
|
# bob2@univ-rennes1.fr
|
||||||
# -> error
|
# -> ok
|
||||||
# root@physix-master:~# qconf -Mconf /tmp/global
|
|
||||||
# only a single value is allowed for configuration attribute "administrator_mail"
|
|
||||||
|
|
||||||
cleaned_value = re.sub(',\s*', ',', v)
|
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr \
|
||||||
|
# bob2@univ-rennes1.fr
|
||||||
|
# -> error
|
||||||
|
# root@physix-master:~# qconf -Mconf /tmp/global
|
||||||
|
# only a single value is allowed for configuration attribute "administrator_mail"
|
||||||
|
|
||||||
# prevent space pollution in space separated values, such as in reporting_params (see https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=2812). If spaces are not compacted, the space separated values will contain more and more spaces and at some point corrupt the value : a line containing just a backslash, such as in the following example:
|
cleaned_value = re.sub(r',\s*', ',', v)
|
||||||
# reporting_params accounting=true reporting=false \
|
|
||||||
# flush_time=00:00:15 joblog=false \
|
# prevent space pollution in space separated values, such as in reporting_params (see https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=2812). If spaces are not compacted, the space separated values will contain more and more spaces and at some point corrupt the value : a line containing just a backslash, such as in the following example:
|
||||||
# sharelog=00:00:00
|
# reporting_params accounting=true reporting=false \
|
||||||
# \
|
# flush_time=00:00:15 joblog=false \
|
||||||
cleaned_value = re.sub('\s+', ' ', cleaned_value)
|
# sharelog=00:00:00
|
||||||
strResult += "%s %s\n" % (k, cleaned_value)
|
# \
|
||||||
# print("strResult=%s" % strResult)
|
cleaned_value = re.sub(r'\s+', ' ', cleaned_value)
|
||||||
return strResult
|
strResult += "%s %s\n" % (k, cleaned_value)
|
||||||
def asFormat2String( self ):
|
# print("strResult=%s" % strResult)
|
||||||
strResult = ""
|
return strResult
|
||||||
iNumAttrs = len(self.m_attrs)
|
|
||||||
if iNumAttrs == 0:
|
def asFormat2String(self):
|
||||||
return "NONE"
|
strResult = ""
|
||||||
iAttr = 0
|
iNumAttrs = len(self.m_attrs)
|
||||||
for (k,v) in self.m_attrs.items():
|
if iNumAttrs == 0:
|
||||||
#print "%s %s" % (k,v)
|
return "NONE"
|
||||||
strResult += "%s=%s" % (k,v)
|
iAttr = 0
|
||||||
if iAttr != (iNumAttrs - 1):
|
for (k, v) in self.m_attrs.items():
|
||||||
strResult += ","
|
# print "%s %s" % (k,v)
|
||||||
iAttr+=1
|
strResult += "%s=%s" % (k, v)
|
||||||
#print strSgeConfigString
|
if iAttr != (iNumAttrs - 1):
|
||||||
return strResult
|
strResult += ","
|
||||||
def dump( self ):
|
iAttr += 1
|
||||||
for (k,v) in self.m_attrs.items():
|
# print strSgeConfigString
|
||||||
print("['%s']='%s'" % (k,v))
|
return strResult
|
||||||
|
|
||||||
|
def dump(self):
|
||||||
|
for (k, v) in self.m_attrs.items():
|
||||||
|
print("['%s']='%s'" % (k, v))
|
||||||
|
|
|
@ -15,6 +15,7 @@ else:
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
|
|
||||||
|
|
||||||
def sendTextMail(strFrom, to, strSubject, text):
|
def sendTextMail(strFrom, to, strSubject, text):
|
||||||
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
|
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
|
||||||
mail = MIMEText(text)
|
mail = MIMEText(text)
|
||||||
|
@ -31,7 +32,7 @@ def sendTextMail(strFrom, to, strSubject, text):
|
||||||
class Error(Exception):
|
class Error(Exception):
|
||||||
def __init__(self, strMessage):
|
def __init__(self, strMessage):
|
||||||
self.m_strMessage = strMessage
|
self.m_strMessage = strMessage
|
||||||
|
|
||||||
|
|
||||||
def getHostName():
|
def getHostName():
|
||||||
(returnCode, stdout, stderr) = executeProgram(['hostname', '-s'])
|
(returnCode, stdout, stderr) = executeProgram(['hostname', '-s'])
|
||||||
|
@ -47,7 +48,7 @@ def log(message):
|
||||||
|
|
||||||
def executeProgram(astrArguments):
|
def executeProgram(astrArguments):
|
||||||
# log('executeProgram : program [%s]' % (','.join(astrArguments)))
|
# log('executeProgram : program [%s]' % (','.join(astrArguments)))
|
||||||
popen = subprocess.Popen( astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
|
popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
|
||||||
stdout, stderr = popen.communicate()
|
stdout, stderr = popen.communicate()
|
||||||
# popen.wait()
|
# popen.wait()
|
||||||
result = (popen.returncode, stdout.decode(), stderr)
|
result = (popen.returncode, stdout.decode(), stderr)
|
||||||
|
@ -60,7 +61,7 @@ def executeCommand(command):
|
||||||
"""
|
"""
|
||||||
executes the shell command such as 'set x=1; myprog $x'
|
executes the shell command such as 'set x=1; myprog $x'
|
||||||
"""
|
"""
|
||||||
popen = subprocess.Popen( [command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time
|
popen = subprocess.Popen([command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time
|
||||||
# if we don't specify the optional executable argument, then the default non interactive shell will be used. On debian, the default non-interactive shell is dash, which doesn't understand the keyword 'source' that we use in many places
|
# if we don't specify the optional executable argument, then the default non interactive shell will be used. On debian, the default non-interactive shell is dash, which doesn't understand the keyword 'source' that we use in many places
|
||||||
stdout, stderr = popen.communicate()
|
stdout, stderr = popen.communicate()
|
||||||
# popen.wait()
|
# popen.wait()
|
||||||
|
@ -85,29 +86,29 @@ def executeCommandOn(target_machine_fqdn, command, user=None):
|
||||||
target = '%s@%s' % (user, target_machine_fqdn)
|
target = '%s@%s' % (user, target_machine_fqdn)
|
||||||
else:
|
else:
|
||||||
target = target_machine_fqdn
|
target = target_machine_fqdn
|
||||||
|
|
||||||
result = executeProgram(['ssh', target, "%s" % command])
|
result = executeProgram(['ssh', target, "%s" % command])
|
||||||
logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user))
|
logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def getUpsStatus():
|
def getUpsStatus():
|
||||||
|
|
||||||
class MyHTMLParser(HTMLParser):
|
class MyHTMLParser(HTMLParser):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
HTMLParser.__init__(self)
|
HTMLParser.__init__(self)
|
||||||
self.TokenList = []
|
self.TokenList = []
|
||||||
|
|
||||||
def handle_data( self, data):
|
def handle_data(self, data):
|
||||||
data = data.strip()
|
data = data.strip()
|
||||||
if data and len(data) > 0:
|
if data and len(data) > 0:
|
||||||
self.TokenList.append(data)
|
self.TokenList.append(data)
|
||||||
# print data
|
# print data
|
||||||
|
|
||||||
def GetTokenList(self):
|
def GetTokenList(self):
|
||||||
return self.TokenList
|
return self.TokenList
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
|
|
||||||
try:
|
try:
|
||||||
url = 'http://Net Vision:public@129.20.27.119/PageMonComprehensive.html'
|
url = 'http://Net Vision:public@129.20.27.119/PageMonComprehensive.html'
|
||||||
f = urlopen(url)
|
f = urlopen(url)
|
||||||
|
@ -118,7 +119,8 @@ def getUpsStatus():
|
||||||
return
|
return
|
||||||
h = MyHTMLParser()
|
h = MyHTMLParser()
|
||||||
h.feed(res)
|
h.feed(res)
|
||||||
tokensList = h.GetTokenList() # @UnusedVariable
|
tokensList = h.GetTokenList() # noqa:F841
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from SimpaDbUtil import wakeUp
|
from SimpaDbUtil import wakeUp
|
||||||
|
|
|
@ -1,63 +0,0 @@
|
||||||
'''
|
|
||||||
The goal of this application is to convert a mno database into mno's web site compatible database (drupal)
|
|
||||||
'''
|
|
||||||
|
|
||||||
import sqlite3
|
|
||||||
|
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
from SimpaDbUtil import SqlFile, SqlDatabaseReader
|
|
||||||
from _sqlite3 import Row
|
|
||||||
|
|
||||||
class OrchestraSqlDb( object ):
|
|
||||||
def __init__(self, sql_reader):
|
|
||||||
"""
|
|
||||||
:param SqlDatabaseReader sql_reader: the inventory database
|
|
||||||
"""
|
|
||||||
super(OrchestraSqlDb, self).__init__()
|
|
||||||
self._sql_reader = sql_reader
|
|
||||||
|
|
||||||
def query(self, sql_query):
|
|
||||||
return self._sql_reader.query(sql_query)
|
|
||||||
|
|
||||||
|
|
||||||
class Concert(object):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class Recording(object):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class OrchestraDb(object):
|
|
||||||
|
|
||||||
def __init__(self, mno_drupal_db_sql_file_path):
|
|
||||||
self.concerts = {}
|
|
||||||
|
|
||||||
sql_source = SqlFile(mno_drupal_db_sql_file_path)
|
|
||||||
sql_reader = SqlDatabaseReader(sql_source)
|
|
||||||
orchestra_sql_db = OrchestraSqlDb(sql_reader)
|
|
||||||
|
|
||||||
self._parse_from_orchestra_drupal_db(orchestra_sql_db)
|
|
||||||
|
|
||||||
def _parse_from_orchestra_drupal_db(self, orchestra_sql_db):
|
|
||||||
"""
|
|
||||||
:param OrchestraSqlDb orchestra_sql_db:
|
|
||||||
"""
|
|
||||||
|
|
||||||
concert_rows = orchestra_sql_db.query("SELECT nid,title FROM node WHERE type is 'concert'")
|
|
||||||
|
|
||||||
for concert_row in concert_rows:
|
|
||||||
(nid, title)=concert_row
|
|
||||||
print(title)
|
|
||||||
nid = int(nid)
|
|
||||||
track_id_rows = orchestra_sql_db.query("SELECT field_tracks_target_id FROM field_revision_field_tracks WHERE entity_id=%d" % nid )
|
|
||||||
for track_id_row in track_id_rows:
|
|
||||||
(field_tracks_target_id, ) = track_id_row
|
|
||||||
#print(field_tracks_target_id)
|
|
||||||
|
|
||||||
track_rows = orchestra_sql_db.query("SELECT title FROM node WHERE nid=%d" % field_tracks_target_id)
|
|
||||||
(recording_title, ) = track_rows[0]
|
|
||||||
print("\t%s" % recording_title)
|
|
||||||
|
|
||||||
mno_db = OrchestraDb('/Users/graffy/data/Perso/MeltingNotes_work.git/website/v2_drupal/melting_drupal.sql')
|
|
|
@ -1,84 +1,83 @@
|
||||||
import re
|
import re
|
||||||
|
|
||||||
def mysql_to_sqlite( mysql_sql_code, truncate_hex_strings = False ):
|
|
||||||
"""
|
|
||||||
converts a mysql-compatible sql code into a sqlite-ompatible sql code
|
|
||||||
|
|
||||||
note: the original code was found on internet, then tweaked
|
|
||||||
"""
|
|
||||||
content = mysql_sql_code
|
|
||||||
|
|
||||||
# unused commands
|
def mysql_to_sqlite(mysql_sql_code, truncate_hex_strings=False):
|
||||||
COMMAND_RE = re.compile(r'^(SET).*?;\n$', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
"""
|
||||||
content = COMMAND_RE.sub('', content)
|
converts a mysql-compatible sql code into a sqlite-ompatible sql code
|
||||||
|
|
||||||
# sqlite doesn't like COMMENT= , remove it properly before the table constraint filter because the table constraint filter is not clever enough to cope with ; inside comment strings
|
note: the original code was found on internet, then tweaked
|
||||||
# ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='A list of URL aliases for Drupal paths; a user may visit...';
|
"""
|
||||||
COMMENTS_EQUAL_RE = re.compile(r'\s+COMMENT=\'[^\']*\'', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
content = mysql_sql_code
|
||||||
# content = re.sub(r'^-- Tab[.]', 'toto', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
content = COMMENTS_EQUAL_RE.sub('', content)
|
|
||||||
|
|
||||||
# table constraints
|
# unused commands
|
||||||
TCONS_RE = re.compile(r'\)(\s*(CHARSET|DEFAULT|ENGINE)(=.*?)?\s*)+;', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
COMMAND_RE = re.compile(r'^(SET).*?;\n$', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
content = TCONS_RE.sub(');', content)
|
content = COMMAND_RE.sub('', content)
|
||||||
|
|
||||||
# remove comments
|
# sqlite doesn't like COMMENT= , remove it properly before the table constraint filter because the table constraint filter is not clever enough to cope with ; inside comment strings
|
||||||
# `nid` int(10) UNSIGNED NOT NULL DEFAULT '0' COMMENT 'The node.nid this record affects.',
|
# ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='A list of URL aliases for Drupal paths; a user may visit...';
|
||||||
COMMENTS_RE = re.compile(r'\s+COMMENT\s+\'[^\']*\'', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
COMMENTS_EQUAL_RE = re.compile(r'\s+COMMENT=\'[^\']*\'', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
# content = re.sub(r'^-- Tab[.]', 'toto', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
# content = re.sub(r'^-- Tab[.]', 'toto', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
content = COMMENTS_RE.sub('', content)
|
content = COMMENTS_EQUAL_RE.sub('', content)
|
||||||
|
|
||||||
# sqlite doesn't like ' being escaped as \', use '' instead
|
# table constraints
|
||||||
content = re.sub(r'\\\'', '\'\'', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
|
TCONS_RE = re.compile(r'\)(\s*(CHARSET|DEFAULT|ENGINE)(=.*?)?\s*)+;', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
content = TCONS_RE.sub(');', content)
|
||||||
|
|
||||||
if truncate_hex_strings:
|
# remove comments
|
||||||
# sqlite doesn't like too big hex strings 0x613a343a7b733a383a
|
# `nid` int(10) UNSIGNED NOT NULL DEFAULT '0' COMMENT 'The node.nid this record affects.',
|
||||||
content = re.sub(r'0x[0-9a-f]+', '0xdeadbeef', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
|
COMMENTS_RE = re.compile(r'\s+COMMENT\s+\'[^\']*\'', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
# content = re.sub(r'^-- Tab[.]', 'toto', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
content = COMMENTS_RE.sub('', content)
|
||||||
|
|
||||||
# sqlite doesn't understand
|
# sqlite doesn't like ' being escaped as \', use '' instead
|
||||||
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
|
content = re.sub(r'\\\'', '\'\'', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
content = re.sub(r'\s+CHARACTER SET\s+[^\s]+', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
|
|
||||||
|
|
||||||
# sqlite doesn't know the utf8_bin :
|
|
||||||
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
|
|
||||||
#no such collation sequence: utf8_bin
|
|
||||||
content = re.sub(r'\s+COLLATE\s+utf8_bin\s+', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
|
|
||||||
|
|
||||||
# sqlite doesn't like 'unsigned' as in `ip_address_3` tinyint(3) unsigned NOT NULL default '27',
|
if truncate_hex_strings:
|
||||||
content = re.sub(r' unsigned ', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
|
# sqlite doesn't like too big hex strings 0x613a343a7b733a383a
|
||||||
|
content = re.sub(r'0x[0-9a-f]+', '0xdeadbeef', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
|
||||||
# sqlite doesn't like 'enum' as in `type` enum('normal','light_out_management') NOT NULL default 'normal',,
|
# sqlite doesn't understand
|
||||||
content = re.sub(r' enum\([^\)]*\) ', ' varchar(255) ', content)
|
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
|
||||||
|
content = re.sub(r'\s+CHARACTER SET\s+[^\s]+', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
|
||||||
# sqlite doesn't support much of alter table (https://www.sqlite.org/lang_altertable.html). The following is not supported :
|
# sqlite doesn't know the utf8_bin :
|
||||||
# ALTER TABLE `blocked_ips`
|
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
|
||||||
# ADD PRIMARY KEY (`iid`),
|
# no such collation sequence: utf8_bin
|
||||||
# ADD KEY `blocked_ip` (`ip`);
|
content = re.sub(r'\s+COLLATE\s+utf8_bin\s+', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
content = re.sub(r'alter table [^;]*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
|
|
||||||
|
|
||||||
|
# sqlite doesn't like 'unsigned' as in `ip_address_3` tinyint(3) unsigned NOT NULL default '27',
|
||||||
|
content = re.sub(r' unsigned ', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
|
||||||
# COMMIT;
|
# sqlite doesn't like 'enum' as in `type` enum('normal','light_out_management') NOT NULL default 'normal',,
|
||||||
# sqlite3.OperationalError: cannot commit - no transaction is active
|
content = re.sub(r' enum\([^\)]*\) ', ' varchar(255) ', content)
|
||||||
content = re.sub(r'commit\s*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
|
|
||||||
|
|
||||||
# insert multiple values
|
# sqlite doesn't support much of alter table (https://www.sqlite.org/lang_altertable.html). The following is not supported :
|
||||||
# INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*\((.*)\*;', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
# ALTER TABLE `blocked_ips`
|
||||||
INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*([^;]*);', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
# ADD PRIMARY KEY (`iid`),
|
||||||
#INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*((\[^\)](\)));$', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
# ADD KEY `blocked_ip` (`ip`);
|
||||||
INSERTVALS_SPLIT_RE = re.compile(r'\)\s*,\s*\(', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
content = re.sub(r'alter table [^;]*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
|
||||||
|
# COMMIT;
|
||||||
|
# sqlite3.OperationalError: cannot commit - no transaction is active
|
||||||
|
content = re.sub(r'commit\s*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
|
|
||||||
def insertvals_replacer(match):
|
# insert multiple values
|
||||||
insert, values = match.groups()
|
# INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*\((.*)\*;', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
# print("insert=%s"%insert)
|
INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*([^;]*);', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
# print("values=%s"%values)
|
# INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*((\[^\)](\)));$', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
values = re.sub('^\s*\(' ,'', values)
|
INSERTVALS_SPLIT_RE = re.compile(r'\)\s*,\s*\(', re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
||||||
values = re.sub('\)\s*$' ,'', values)
|
|
||||||
replacement = ''
|
|
||||||
for vals in INSERTVALS_SPLIT_RE.split(values):
|
|
||||||
#print("vals=%s"%vals)
|
|
||||||
replacement = '%s\n%s (%s);' % (replacement, insert, vals)
|
|
||||||
return replacement
|
|
||||||
|
|
||||||
content = INSERTVALS_RE.sub(insertvals_replacer, content)
|
def insertvals_replacer(match):
|
||||||
return content
|
insert, values = match.groups()
|
||||||
|
# print("insert=%s"%insert)
|
||||||
|
# print("values=%s"%values)
|
||||||
|
values = re.sub(r'^\s*\(', '', values)
|
||||||
|
values = re.sub(r'\)\s*$', '', values)
|
||||||
|
replacement = ''
|
||||||
|
for vals in INSERTVALS_SPLIT_RE.split(values):
|
||||||
|
# print("vals=%s"%vals)
|
||||||
|
replacement = '%s\n%s (%s);' % (replacement, insert, vals)
|
||||||
|
return replacement
|
||||||
|
|
||||||
|
content = INSERTVALS_RE.sub(insertvals_replacer, content)
|
||||||
|
return content
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
class Version(object):
|
class Version(object):
|
||||||
"""
|
"""
|
||||||
simple version number made of a series of positive integers separated by dots
|
simple version number made of a series of positive integers separated by dots
|
||||||
|
|
||||||
distutils.version.StrictVersion : not good because versions such as 3.2.0.4 are not allowed (StrictVersion allows no more than 3 numbers)
|
distutils.version.StrictVersion : not good because versions such as 3.2.0.4 are not allowed (StrictVersion allows no more than 3 numbers)
|
||||||
distutils.version.LooseVersion : not good because the version string could be anything (https://stackoverflow.com/questions/11887762/how-do-i-compare-version-numbers-in-python)
|
distutils.version.LooseVersion : not good because the version string could be anything (https://stackoverflow.com/questions/11887762/how-do-i-compare-version-numbers-in-python)
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, version_as_string):
|
def __init__(self, version_as_string):
|
||||||
"""
|
"""
|
||||||
:param str version_as_string: eg '6.2u5' or '8.1.9'
|
:param str version_as_string: eg '6.2u5' or '8.1.9'
|
||||||
"""
|
"""
|
||||||
self.numbers = [int(s) for s in version_as_string.replace('u', '.').split('.')]
|
self.numbers = [int(s) for s in version_as_string.replace('u', '.').split('.')]
|
||||||
|
|
||||||
def get_number(self, index):
|
def get_number(self, index):
|
||||||
if index >= len(self.numbers):
|
if index >= len(self.numbers):
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
import socket
|
import socket
|
||||||
import struct
|
import struct
|
||||||
|
|
||||||
|
|
||||||
def wake_on_lan(macaddress):
|
def wake_on_lan(macaddress):
|
||||||
""" Switches on remote computers using WOL. """
|
""" Switches on remote computers using WOL. """
|
||||||
|
|
||||||
|
@ -15,10 +16,10 @@ def wake_on_lan(macaddress):
|
||||||
macaddress = macaddress.replace(sep, '')
|
macaddress = macaddress.replace(sep, '')
|
||||||
else:
|
else:
|
||||||
raise ValueError('Incorrect MAC address format')
|
raise ValueError('Incorrect MAC address format')
|
||||||
|
|
||||||
# Pad the synchronization stream.
|
# Pad the synchronization stream.
|
||||||
data = ''.join(['FFFFFFFFFFFF', macaddress * 20])
|
data = ''.join(['FFFFFFFFFFFF', macaddress * 20])
|
||||||
send_data = ''
|
send_data = ''
|
||||||
|
|
||||||
# Split up the hex values and pack.
|
# Split up the hex values and pack.
|
||||||
for i in range(0, len(data), 2):
|
for i in range(0, len(data), 2):
|
||||||
|
@ -29,14 +30,13 @@ def wake_on_lan(macaddress):
|
||||||
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||||
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
|
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
|
||||||
sock.sendto(send_data, ('<broadcast>', 7))
|
sock.sendto(send_data, ('<broadcast>', 7))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
# Use macaddresses with any seperators.
|
# Use macaddresses with any seperators.
|
||||||
wake_on_lan('00:1E:52:F3:61:60') # simpatix28
|
wake_on_lan('00:1E:52:F3:61:60') # simpatix28
|
||||||
#wake_on_lan('00:24:36:F2:D0:FA') # simpatix33
|
# wake_on_lan('00:24:36:F2:D0:FA') # simpatix33
|
||||||
#wake_on_lan('0F:0F:DF:0F:BF:EF')
|
# wake_on_lan('0F:0F:DF:0F:BF:EF')
|
||||||
#wake_on_lan('0F-0F-DF-0F-BF-EF')
|
# wake_on_lan('0F-0F-DF-0F-BF-EF')
|
||||||
# or without any seperators.
|
# or without any seperators.
|
||||||
#wake_on_lan('0F0FDF0FBFEF')
|
# wake_on_lan('0F0FDF0FBFEF')
|
||||||
|
|
5
setup.py
5
setup.py
|
@ -1,7 +1,8 @@
|
||||||
from setuptools import setup
|
from setuptools import setup
|
||||||
|
|
||||||
setup(name='cocluto',
|
setup(
|
||||||
version=1.00,
|
name='cocluto',
|
||||||
|
version=1.01,
|
||||||
description='compute cluster utility tools',
|
description='compute cluster utility tools',
|
||||||
url='https://git.ipr.univ-rennes1.fr/graffy/cocluto',
|
url='https://git.ipr.univ-rennes1.fr/graffy/cocluto',
|
||||||
author='Guillaume Raffy',
|
author='Guillaume Raffy',
|
||||||
|
|
Loading…
Reference in New Issue