fixed pylint errors and cleaned up

work related to Bug 3315 - make simpaweb django app a packageable application
This commit is contained in:
Guillaume Raffy 2023-05-23 17:27:12 +02:00
parent 7a5d32dec0
commit 270304f58e
28 changed files with 2323 additions and 2293 deletions

View File

@ -2,313 +2,325 @@
import sys
sys.path.insert(0, '..')
import os
import MySQLdb
import threading
from Lib.Util import *
from Lib.SimpaDbUtil import *
import time
from ClusterStatus import ClusterStatus
from SlotAllocator import *
from Log import *
from ClusterNodeStatusUpdater import *
from SlotAllocator import DecoupledSlotAllocator
from Log import logDebug, logInfo
from ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier
from SunGridEngine import SunGridEngine
import Util
from Util import log, onException
from WebServer import WebServerThread
from PowerState import PowerState
from HTMLParser import HTMLParser
VERSION='1.18'
VERSION = '1.18'
class MyHTMLParser(HTMLParser):
    """
    HTML parser that collects the non-blank text fragments of the document it is fed
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.TokenList = []  # the stripped text fragments, in the order they were encountered

    def handle_data(self, data):
        """
        called by HTMLParser for each run of text; stores it unless it is only whitespace
        """
        data = data.strip()
        if data:
            self.TokenList.append(data)

    def GetTokenList(self):
        """
        returns the list of text fragments collected so far (the live list, not a copy)
        """
        return self.TokenList
class WakeUpCompleteNotifier(IWakeUpCompleteNotifier):
    """
    notifier that forwards the completion of a machine's wake up to the cluster controller
    """

    def __init__(self, machineName, clusterController):
        """
        :param machineName: name of the machine whose wake up is being tracked
        :param clusterController: object whose onMachineWakeUpComplete(machineName) is called on completion
        """
        self.m_machineName = machineName
        self.m_clusterController = clusterController

    def onWakeUpComplete(self):
        logDebug('WakeUpCompleteNotifier::onWakeUpComplete : start')
        self.m_clusterController.onMachineWakeUpComplete(self.m_machineName)
class SleepCompleteNotifier(ISleepCompleteNotifier):
    """
    notifier that forwards the outcome of a machine's sleep request to the cluster controller
    """

    def __init__(self, machineName, clusterController):
        """
        :param machineName: name of the machine whose sleep request is being tracked
        :param clusterController: object whose onMachineSleepComplete(machineName, bSleepSucceeded) is called on completion
        """
        self.m_machineName = machineName
        self.m_clusterController = clusterController

    def onSleepComplete(self, bSleepSucceeded):
        """
        :param bSleepSucceeded: passed through unchanged to the cluster controller
        """
        logDebug('SleepCompleteNotifier::onSleepComplete : start')
        self.m_clusterController.onMachineSleepComplete(self.m_machineName, bSleepSucceeded)
def jouleToKwh(fEnergyInJoules):
    """
    converts an energy expressed in joules into kilowatt-hours

    :param fEnergyInJoules: energy, in joules
    :return: the same energy, in kWh (always a float)
    """
    # 1 kWh = 1000 W * 3600 s = 3.6e6 J
    return fEnergyInJoules / (1000.0 * 3600.0)
class ClusterController:
    """
    The cluster controller monitors the cluster's activity and has multiple purposes :
    - energy saving : it can put some machines to sleep if they have nothing to do, or it
      can wake them up when needed (eg when a new job has arrived)
    - auto-repair : for example
        - it sometimes happened that the sge_execd process disappeared for some unknown reason;
          in that case, the cluster controller can detect it and restart the daemon
          automatically, without administrator's intervention
        - clear the Error state of queues
    - it could also be used to dynamically adapt sge's settings to the requirements of
      jobs (eg add some machines to a queue).

    Mechanism to let user get priority
    """

    def __init__(self):
        gridEngine = SunGridEngine()
        self.m_clusterStatus = ClusterStatus(gridEngine)
        self.m_slotAllocator = DecoupledSlotAllocator()  # SimpleSlotAllocator()
        self.m_machinesThatNeedWakeUp = {}
        self.m_machinesThatNeedWakeupLock = threading.Lock()  # to prevent concurrent access to m_machinesThatNeedWakeUp
        self.m_machinesThatNeedSleeping = {}
        self.m_machinesThatNeedSleepingLock = threading.Lock()  # to prevent concurrent access to m_machinesThatNeedSleeping
        self.m_lastEnergyStatusLogTime = None
        self.DELAY_BETWEEN_ENERGY_STATUS_LOGS = 60  # in seconds
        self.m_iSessionId = None  # session (run) identifier in database
        self.m_webServer = WebServerThread(self)
        self.m_bStop = False
        self.m_bStopLock = threading.Lock()  # to prevent concurrent access to m_bStop

    def getClusterStatus(self):
        """
        returns the ClusterStatus object this controller works on
        """
        return self.m_clusterStatus

    def log(self, message):
        print(message)

    def shutdownLeastImportantNode(self):
        # only logs for now
        self.log("ClusterController::shutdownLeastImportantNode : start")

    def onMachineWakeUpComplete(self, machineName):
        """
        callback (see WakeUpCompleteNotifier) : the wake up request for machineName has been handled

        :raises KeyError: if machineName is not in the list of machines that need waking up
        """
        # 'with' guarantees that the lock is released even if the machine is not in the dictionary
        with self.m_machinesThatNeedWakeupLock:
            del self.m_machinesThatNeedWakeUp[machineName]
        logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)

    def onMachineSleepComplete(self, machineName, bSleepSucceeded):
        """
        callback (see SleepCompleteNotifier) : the sleep request for machineName has been handled

        :param bSleepSucceeded: only used to choose the logged message
        :raises KeyError: if machineName is not in the list of machines that need sleeping
        """
        # 'with' guarantees that the lock is released even if the machine is not in the dictionary
        with self.m_machinesThatNeedSleepingLock:
            del self.m_machinesThatNeedSleeping[machineName]
        if bSleepSucceeded:
            logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
        else:
            logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)

    def getNumPendingWakeUps(self):
        """
        returns the number of machines whose wake up request has not completed yet
        """
        with self.m_machinesThatNeedWakeupLock:
            return len(self.m_machinesThatNeedWakeUp)

    def getNumPendingSleeps(self):
        """
        returns the number of machines whose sleep request has not completed yet
        """
        with self.m_machinesThatNeedSleepingLock:
            return len(self.m_machinesThatNeedSleeping)

    def putIdleMachinesToSleep(self):
        """
        requests sleep for every idle machine that is currently ON (except simpatix10), then
        blocks until all these requests have been handled
        """
        self.m_clusterStatus.m_lock.acquire()
        try:
            idleMachines = self.m_clusterStatus.getIdleMachines()
            # NOTE(review): this attribute is never read in this file; it is only kept so that the object's state is unchanged
            self.m_machinesThatNeedToSleep = []
            for idleMachine in idleMachines.values():
                if idleMachine.getPowerState() == PowerState.ON:
                    if idleMachine.getName() != 'simpatix10':  # never put simpatix10 to sleep because it's the sge master and is also server for other things
                        self.m_machinesThatNeedSleeping[idleMachine.getName()] = idleMachine
        finally:
            self.m_clusterStatus.m_lock.release()

        # take a real copy : the completion callbacks remove entries from m_machinesThatNeedSleeping, and
        # on python 3, values() alone is a live view that cannot be iterated while the dictionary changes
        listOfMachinesThatNeedSleeping = list(self.m_machinesThatNeedSleeping.values())
        for machine in listOfMachinesThatNeedSleeping:
            logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName())
            machine.requestSleep(SleepCompleteNotifier(machine.getName(), self))

        if len(listOfMachinesThatNeedSleeping) != 0:
            # hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
            while self.getNumPendingSleeps() > 0:
                time.sleep(1)

    def wakeUpMachinesForPendingJobs(self):
        """
        requests the wake up of the machines that the slot allocator deems necessary for the pending
        jobs, waits for these requests to be handled, then gives SGE some time to allocate slots
        """
        listOfMachinesThatNeedWakeUp = []

        self.m_clusterStatus.m_lock.acquire()
        try:
            pendingJobs = self.m_clusterStatus.getPendingJobs()
            if len(pendingJobs) != 0:
                self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp(pendingJobs, self.m_clusterStatus)
                if len(self.m_machinesThatNeedWakeUp) != 0:
                    # take a real copy : the completion callbacks remove entries from m_machinesThatNeedWakeUp, and
                    # on python 3, values() alone is a live view that cannot be iterated while the dictionary changes
                    listOfMachinesThatNeedWakeUp = list(self.m_machinesThatNeedWakeUp.values())
                    for machine in listOfMachinesThatNeedWakeUp:
                        logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for ' + machine.getName())
                        machine.requestWakeUp(WakeUpCompleteNotifier(machine.getName(), self))
        finally:
            self.m_clusterStatus.m_lock.release()

        if len(listOfMachinesThatNeedWakeUp) != 0:
            # hack : wait until the wakeup requests are handled so that a later sleep request doesn't cancel it
            # and also wait for the jobs to come in
            while self.getNumPendingWakeUps() > 0:
                time.sleep(1)
            iSGE_CHEK_RUNNABLE_JOBS_DELAY = 60 * 5  # max time it takes for sge between the fact that a queued job is runnable and SGE actually starting it (I've put a long time here because sometimes, qstat takes a long time to realise that the machine is available after I wake it up)
            logInfo('ClusterController::wakeUpMachinesForPendingJobs : all required machines are awake. Now give %d seconds to SGE to allocate slots.' % iSGE_CHEK_RUNNABLE_JOBS_DELAY)
            # wait until SGE has a chance to allocate slots
            time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY)  # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time....
            logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots')

    def updateNormalState(self):
        # attempt to shut down machines that are idle
        self.putIdleMachinesToSleep()
        # wake up necessary machines if there are pending jobs
        self.wakeUpMachinesForPendingJobs()

    def storeSessionInDatabase(self):
        """
        records this run in the sessions_desc and session_to_energy_savings tables

        :return: the identifier of the session
        """
        conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
        assert conn
        try:
            # retrieve the session id, as it's an auto_increment field
            # NOTE(review): the id is read before the INSERT; this presumably assumes that no other session is created in between
            sqlCommand = "SELECT AUTO_INCREMENT FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'clustercontroller' AND TABLE_NAME = 'sessions_desc'"
            print(sqlCommand)
            conn.query(sqlCommand)
            r = conn.store_result()
            iSessionId = r.fetch_row()[0][0]
            # stores information about the session
            sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.m_clusterStatus.m_clusterNodes))
            print(sqlCommand)
            conn.query(sqlCommand)
            # initialize the energy savings table
            sqlCommand = "INSERT INTO session_to_energy_savings (session_id, energy_savings_kwh) VALUES (%d,0.0);" % (iSessionId)
            print(sqlCommand)
            conn.query(sqlCommand)
        finally:
            # don't leak the connection if one of the queries fails
            conn.close()
        print('Session Iid = %d' % iSessionId)
        return iSessionId

    def updateSessionEnergyConsumptionInDatabase(self):
        """
        stores the current energy savings and refreshes the end time of the current session
        """
        # NOTE(review): this connects as 'root' while storeSessionInDatabase connects as 'clusterctrl' - confirm this is intended
        conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller')
        assert conn
        try:
            # update energy savings for the current session
            sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.m_clusterStatus.getEnergySavings()), self.m_iSessionId)
            print(sqlCommand)
            conn.query(sqlCommand)
            # update the end time of the current session
            sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.m_iSessionId)
            print(sqlCommand)
            conn.query(sqlCommand)
        finally:
            # don't leak the connection if one of the queries fails
            conn.close()

    def setControlOnMachine(self, machineName, bControl):
        """
        adds or removes the control of ClusterController on the given machine
        """
        self.m_clusterStatus.setControlOnMachine(machineName, bControl)

    def _logEnergyStatus(self, startTime):
        """
        logs the machine, slot, power and energy figures of the cluster and stores the energy savings in the database

        :param startTime: struct_time from which the energy figures are reported as counted
        """
        iNumMachines = len(self.m_clusterStatus.m_clusterNodes)
        iNumMachinesOn = 0
        iNumSleepingMachines = 0
        for machine in self.m_clusterStatus.m_clusterNodes.values():
            ePowerState = machine.getPowerState()
            if ePowerState == PowerState.ON:
                iNumMachinesOn += 1
            elif ePowerState == PowerState.SLEEP:
                iNumSleepingMachines += 1
        logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))
        iNumSlots = self.m_clusterStatus.getNumControlledSlots()
        iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots()
        iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots()
        iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots()
        logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots))
        logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()))
        logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings())))
        self.updateSessionEnergyConsumptionInDatabase()

    def run(self):
        """
        main loop : registers the session, starts the reading threads and the web server, then
        repeatedly reports the energy status and updates the power state of the machines until
        m_bStop is set
        """
        self.m_iSessionId = self.storeSessionInDatabase()
        log("storeSessionInDatabase completed")

        DELAY_BETWEEN_MEASURES = 10  # in seconds
        self.m_clusterStatus.startReadingThreads()
        self.m_webServer.start()
        while not self.m_clusterStatus.isReady():
            log('waiting for system to be ready')
            time.sleep(1)
        logInfo('ClusterController::run : cluster initial readings have completed')
        startTime = time.localtime()
        # NOTE(review): m_bStop is read here without taking m_bStopLock
        while not self.m_bStop:
            currentTime = time.time()
            if (self.m_lastEnergyStatusLogTime is None) or (currentTime > (self.m_lastEnergyStatusLogTime + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
                self._logEnergyStatus(startTime)
                self.m_lastEnergyStatusLogTime = currentTime

            self.updateNormalState()
            time.sleep(DELAY_BETWEEN_MEASURES)
        self.m_clusterStatus.stopReadingThreads()
def storeClusterNodeStatus(clusterNodeStatus):
    """
    logs the current reading of each sensor of the given cluster node into the simpa_measurements database

    :param clusterNodeStatus: object with m_clusterNodeName and m_sensors (a dictionary of sensors);
        each sensor provides m_name, typeName() and, depending on its type, m_rpms ('Fan') or
        m_temperature ('Temperature')
    :raises AssertionError: if a sensor has any other type
    """
    # conn = MySQLdb.connect('simpatix10', 'measures_writer', '', 'simpa_measurements')
    conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
    assert conn
    try:
        for sensor in clusterNodeStatus.m_sensors.values():
            sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name
            sensorType = sensor.typeName()
            if sensorType == 'Fan':
                sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_rpms) + """, NOW());"""
            elif sensorType == 'Temperature':
                sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_temperature) + """, NOW());"""
            else:
                # explicit raise instead of 'assert False', which is skipped when python runs with -O
                raise AssertionError('unexpected sensor type : %s' % sensorType)
            print(sqlCommand)
            conn.query(sqlCommand)
    finally:
        # don't leak the connection if a query fails or if a sensor has an unexpected type
        conn.close()
if __name__ == '__main__':
    # entry point : creates the controller and runs it until it stops or fails
    try:
        logInfo('ClusterController v. %s starting....' % VERSION)
        controller = ClusterController()
        controller.run()
    except BaseException as exception:  # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
        onException(exception)

View File

@ -1,140 +1,142 @@
import threading
from PowerState import *
from ClusterNodeStatusUpdater import *
from PowerState import PowerState, PowerStateToStr
from ClusterNodeStatusUpdater import ClusterNodeStatusUpdater
import Lib.Util
import Lib.SimpaDbUtil
from Log import logInfo, logWarning
from datetime import datetime
from datetime import *
class ClusterNode:
"""
the state of a machine node
"""
def __init__( self, machineName, cluster, gridEngine ):
self.m_name = machineName
self.m_cluster = cluster # the cluster this machine belongs to
self.m_requestedPowerState = PowerState.ON
self.m_powerState = PowerState.UNKNOWN
self.m_lastPowerStateTime = None # time at which the last value of self.m_powerState has been set
self.m_machineStatusUpdater = ClusterNodeStatusUpdater( machineName, self, gridEngine )
self.m_energyConsumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
self.m_energySavings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
def getName(self):
    """
    returns the name of this machine
    """
    return self.m_name
def isReady(self):
    """
    returns True as soon as the power state of this machine is known
    """
    # the machine is ready in every power state but UNKNOWN (the original special case for ON returned True as well)
    return self.m_powerState != PowerState.UNKNOWN
def getPowerState(self):
    """
    returns the last power state that has been set on this machine
    """
    return self.m_powerState
def setShouldAlwaysBeOn(self):
    """
    flags this machine as always on in its status updater, then sets its power state to ON
    """
    self.m_machineStatusUpdater.setShouldAlwaysBeOn()
    self.setPowerState(PowerState.ON)
def setPowerState(self, powerState):
    """
    sets the power state of this machine

    If the previous state was known, the energy measurements are brought up to date first. When
    the state is initialized or changes, the checks performed by the status updater are adjusted
    to the new state.

    :param powerState: the new power state (one of ON, OFF, SLEEP, UNPLUGGED)
    :raises AssertionError: if the state is initialized or changed to any other value
    """
    bUpdateRequiredChecks = False
    if self.m_powerState == PowerState.UNKNOWN:
        logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been initialized to ' + PowerStateToStr(powerState))
        self.m_powerState = powerState
        self.m_lastPowerStateTime = datetime.now()
        bUpdateRequiredChecks = True
    else:
        # update the estimation of energy consumption
        self.updateEnergyMeasurements()
        # then change the power state
        if self.m_powerState != powerState:
            logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been changed to ' + PowerStateToStr(powerState))
            self.m_powerState = powerState
            self.m_lastPowerStateTime = datetime.now()
            bUpdateRequiredChecks = True
    if bUpdateRequiredChecks:
        if self.m_powerState == PowerState.ON:
            bCheckSensors = True
        elif self.m_powerState in (PowerState.OFF, PowerState.SLEEP, PowerState.UNPLUGGED):
            # the sensors are only checked when the machine is on
            bCheckSensors = False
        else:
            # explicit raise instead of 'assert False', which is skipped when python runs with -O
            raise AssertionError('unexpected power state : %s' % (self.m_powerState,))
        # the power state is checked in every known state
        self.m_machineStatusUpdater.m_bCheckPowerState = True
        self.m_machineStatusUpdater.m_bCheckSensors = bCheckSensors
def onNewPowerStateReading( self, powerState ):
"""
called when a new power state reading arrives

:param powerState: the power state that has just been read for this machine
"""
# a reading that differs from the state we hold is applied with setPowerState; a warning is
# logged first, unless our state was still UNKNOWN (ie this is the first reading)
# NOTE(review): the indentation of this block has been lost, so it is not visible here whether the
# final setPowerState call is inside the outer 'if' or is executed for every reading - confirm
# against the real file before changing this method
if powerState != self.getPowerState():
if self.getPowerState() != PowerState.UNKNOWN:
logWarning('ClusterNode::onNewPowerStateReading : '+self.m_name+'\'s power state has been (manually it seems) changed to '+PowerStateToStr( powerState ))
self.setPowerState( powerState )
"""
the state of a machine node
"""
def __init__(self, machineName, cluster, gridEngine):
    """
    @param machineName the name of this machine
    @param cluster the cluster this machine belongs to
    @param gridEngine passed on to the status updater created for this machine
    """
    self.m_name = machineName
    self.m_cluster = cluster  # the cluster this machine belongs to
    # energy estimates, both in joules, counted since the start of cluster controller
    self.m_energyConsumption = 0.0  # estimate of the energy consumption of this machine
    self.m_energySavings = 0.0  # estimate of the energy savings on this machine caused by the cluster controller
    # power state bookkeeping
    self.m_requestedPowerState = PowerState.ON
    self.m_powerState = PowerState.UNKNOWN
    self.m_lastPowerStateTime = None  # time at which the last value of self.m_powerState has been set
    self.m_machineStatusUpdater = ClusterNodeStatusUpdater(machineName, self, gridEngine)
def getPowerConsumptionForPowerState(self, ePowerState):
    """
    returns the power consumption estimation (in watts) of this machine for the given power state

    @param ePowerState the power state to estimate the consumption for
    """
    fVoltage = 220.0
    fIntensity = 0.0
    # noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine
    if ePowerState == PowerState.ON:
        fIntensity = 0.9  # value when the machine is doing nothing
    elif ePowerState == PowerState.OFF or ePowerState == PowerState.SLEEP:
        fIntensity = 0.1
    elif ePowerState == PowerState.UNPLUGGED:
        fIntensity = 0.0
    else:
        assert False
    return fIntensity * fVoltage
def updateEnergyMeasurements(self):
    """
    adds the energy consumed and saved since the last update to the running estimates (in joules)

    The savings are measured against the consumption the machine would have had
    in the ON power state. self.m_lastPowerStateTime has to be set before this
    is called.
    """
    now = datetime.now()  # sampled once so that no time is lost between the measure and the reset
    # total_seconds() covers the whole interval; timedelta.seconds is only the
    # 0..86399 component and silently drops entire days
    fElapsedSeconds = (now - self.m_lastPowerStateTime).total_seconds()
    fCurrentPower = self.getPowerConsumptionForPowerState(self.m_powerState)
    fPowerWhenOn = self.getPowerConsumptionForPowerState(PowerState.ON)
    self.m_energyConsumption += fCurrentPower * fElapsedSeconds
    self.m_energySavings += (fPowerWhenOn - fCurrentPower) * fElapsedSeconds
    self.m_lastPowerStateTime = now
def getEnergyConsumption(self):
    """
    returns the energy consumption estimate of this machine, in joules

    The estimate is brought up to date before it is returned.
    """
    self.updateEnergyMeasurements()
    fEnergyInJoules = self.m_energyConsumption
    return fEnergyInJoules
def getPowerConsumption(self):
    """
    returns the power consumption estimation (in watts) of this machine in its current power state
    """
    return self.getPowerConsumptionForPowerState(self.m_powerState)
def getEnergySavings(self):
    """
    returns the energy savings estimate of this machine, brought up to date first
    """
    self.updateEnergyMeasurements()
    fSavings = self.m_energySavings
    return fSavings
def onSleepFailedBecauseAJobJustArrived(self):
    """
    logs that the scheduled sleep of this machine has been canceled because of a new job
    """
    message = '%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name
    logInfo(message)
def requestSleep(self, sleepCompleteNotifier=None):
    """
    forwards a sleep request to the status updater of this machine

    @param sleepCompleteNotifier optional notifier handed over along with the request
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestSleep(sleepCompleteNotifier)
def requestWakeUp(self, wakeUpCompleteNotifier=None):
    """
    forwards a wake up request to the status updater of this machine

    @param wakeUpCompleteNotifier optional notifier handed over along with the request
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestWakeUp(wakeUpCompleteNotifier)
def getQueueMachineName(self):
    """
    returns the name of the queue machine that matches this machine

    The queue machine is looked up by machine name in the jobs state of the cluster.
    """
    # the statements that used to follow the return could never run and read
    # an attribute (m_queueName) that nothing assigns : they have been removed
    jobsState = self.getCluster().getJobsState()
    return jobsState.getQueueMachine(self.m_name).getName()
def getCluster(self):
    """
    returns the cluster this machine belongs to
    """
    owningCluster = self.m_cluster
    return owningCluster
def getName(self):
    """
    returns the name of this machine
    """
    machineName = self.m_name
    return machineName
def isReady(self):
    """
    tells if the power state of this machine is known yet
    """
    bWaitingForPowerState = (self.m_powerState == PowerState.UNKNOWN)
    return not bWaitingForPowerState
def getPowerState(self):
    """
    returns the power state currently recorded for this machine
    """
    recordedPowerState = self.m_powerState
    return recordedPowerState
def setShouldAlwaysBeOn(self):
    """
    flags this machine as one that should always be on and records its power state as on
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.setShouldAlwaysBeOn()
    self.setPowerState(PowerState.ON)
def setPowerState(self, powerState):
    """
    records a new power state for this machine

    When a power state was already known, the energy estimates are updated first.
    When the recorded state actually changes (or is set for the first time), the
    periodic checks of the status updater are adjusted to the new state.

    @param powerState the new power state
    """
    if self.m_powerState == PowerState.UNKNOWN:
        message = 'ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been initialized to ' + PowerStateToStr(powerState)
    else:
        # update the estimation of energy consumption
        self.updateEnergyMeasurements()
        if self.m_powerState == powerState:
            return  # the state didn't change : the checks are left as they are
        message = 'ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been changed to ' + PowerStateToStr(powerState)
    logInfo(message)
    self.m_powerState = powerState
    self.m_lastPowerStateTime = datetime.now()

    statusUpdater = self.m_machineStatusUpdater
    if self.m_powerState == PowerState.ON:
        statusUpdater.m_bCheckPowerState = True
        statusUpdater.m_bCheckSensors = True
    elif self.m_powerState == PowerState.OFF or self.m_powerState == PowerState.SLEEP or self.m_powerState == PowerState.UNPLUGGED:
        statusUpdater.m_bCheckPowerState = True
        statusUpdater.m_bCheckSensors = False
    else:
        assert False
def onNewPowerStateReading(self, powerState):
"""
called when a new powerstate reading arrives

@param powerState the power state that has just been read for this machine
"""
# a reading that matches the recorded power state triggers no warning
if powerState != self.getPowerState():
# the very first reading (recorded state still unknown) is not reported as a manual change
if self.getPowerState() != PowerState.UNKNOWN:
logWarning('ClusterNode::onNewPowerStateReading : ' + self.m_name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(powerState))
# NOTE(review): indentation is not visible here; confirm under which condition the new state is recorded
self.setPowerState(powerState)
def getPowerConsumptionForPowerState(self, ePowerState):
    """
    returns the power consumption estimation (in watts) of this machine for the given power state

    @param ePowerState the power state to estimate the consumption for
    """
    fVoltage = 220.0
    fIntensity = 0.0
    # noticed on 26.08.2009 that putting 22 machines from sleep to on eats 17 A, resulting in difference of 0.77 A per machine
    if ePowerState == PowerState.ON:
        fIntensity = 0.9  # value when the machine is doing nothing
    elif ePowerState == PowerState.OFF or ePowerState == PowerState.SLEEP:
        fIntensity = 0.1
    elif ePowerState == PowerState.UNPLUGGED:
        fIntensity = 0.0
    else:
        assert False
    return fIntensity * fVoltage
def updateEnergyMeasurements(self):
    """
    adds the energy consumed and saved since the last update to the running estimates (in joules)

    The savings are measured against the consumption the machine would have had
    in the ON power state. self.m_lastPowerStateTime has to be set before this
    is called.
    """
    now = datetime.now()  # sampled once so that no time is lost between the measure and the reset
    # total_seconds() covers the whole interval; timedelta.seconds is only the
    # 0..86399 component and silently drops entire days
    fElapsedSeconds = (now - self.m_lastPowerStateTime).total_seconds()
    fCurrentPower = self.getPowerConsumptionForPowerState(self.m_powerState)
    fPowerWhenOn = self.getPowerConsumptionForPowerState(PowerState.ON)
    self.m_energyConsumption += fCurrentPower * fElapsedSeconds
    self.m_energySavings += (fPowerWhenOn - fCurrentPower) * fElapsedSeconds
    self.m_lastPowerStateTime = now
def getEnergyConsumption(self):
    """
    returns the energy consumption estimate of this machine, in joules

    The estimate is brought up to date before it is returned.
    """
    self.updateEnergyMeasurements()
    fEnergyInJoules = self.m_energyConsumption
    return fEnergyInJoules
def getPowerConsumption(self):
    """
    returns the power consumption estimation (in watts) of this machine in its current power state
    """
    return self.getPowerConsumptionForPowerState(self.m_powerState)
def getEnergySavings(self):
    """
    returns the energy savings estimate of this machine, brought up to date first
    """
    self.updateEnergyMeasurements()
    fSavings = self.m_energySavings
    return fSavings
def onSleepFailedBecauseAJobJustArrived(self):
    """
    logs that the scheduled sleep of this machine has been canceled because of a new job
    """
    message = '%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name
    logInfo(message)
def requestSleep(self, sleepCompleteNotifier=None):
    """
    forwards a sleep request to the status updater of this machine

    @param sleepCompleteNotifier optional notifier handed over along with the request
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestSleep(sleepCompleteNotifier)
def requestWakeUp(self, wakeUpCompleteNotifier=None):
    """
    forwards a wake up request to the status updater of this machine

    @param wakeUpCompleteNotifier optional notifier handed over along with the request
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestWakeUp(wakeUpCompleteNotifier)
def getQueueMachineName(self):
    """
    returns the name of the queue machine that matches this machine

    The queue machine is looked up by machine name in the jobs state of the cluster.
    """
    # the statements that used to follow the return could never run and read
    # an attribute (m_queueName) that nothing assigns : they have been removed
    jobsState = self.getCluster().getJobsState()
    return jobsState.getQueueMachine(self.m_name).getName()
def getCluster(self):
    """
    returns the cluster this machine belongs to
    """
    owningCluster = self.m_cluster
    return owningCluster

View File

@ -2,187 +2,191 @@ import threading
import time
import Lib.Util
import Lib.SimpaDbUtil
import os
import traceback
import sys
from PowerState import *
from QstatParser import *
import Util
from PowerState import PowerState
from Log import logInfo, logDebug
from Util import blockingWakeUpMachine, blockingPutMachineToSleep, getPowerState, onException
class IWakeUpCompleteNotifier:
    """
    interface for wakeup notifiers
    """

    def onWakeUpComplete(self):
        """
        notification hook; implementations have to override it
        """
        assert False  # this method is abstract
class ISleepCompleteNotifier:
    """
    interface for sleep notifiers
    """

    def onSleepComplete(self, bSleepSucceeded):
        """
        notification hook; implementations have to override it

        @param bSleepSucceeded tells if the machine could be put to sleep
        """
        assert False  # this method is abstract
class IRequest:
    """
    base class of the requests that can be processed for a machine
    """
    # identifiers of the request types
    GO_TO_SLEEP, WAKE_UP, CHECK_POWER_STATE = 1, 2, 3

    def __init__(self, requestType):
        """
        @param requestType one of the request type identifiers of this class
        """
        self.m_type = requestType

    def getType(self):
        """
        returns the request type given at construction
        """
        return self.m_type

    def process(self, clusterNodeStatusUpdater):
        """
        processes this request
        """
        assert False  # this method is abstract
GO_TO_SLEEP = 1
WAKE_UP = 2
CHECK_POWER_STATE = 3
class WakeUpRequest( IRequest ):
def __init__(self, requestType):
    """
    @param requestType identifier of the type of this request
    """
    self.m_type = requestType
def __init__(self, wakeUpNotifier):
    """
    @param wakeUpNotifier notifier kept for when this request gets processed (may be None)
    """
    self.m_wakeUpNotifier = wakeUpNotifier
    IRequest.__init__(self, IRequest.WAKE_UP)
def getType(self):
    """
    returns the type identifier of this request
    """
    requestType = self.m_type
    return requestType
def process(self, clusterNodeStatusUpdater):
    """
    wakes the machine up, activates its queue and records its power state as on

    @param clusterNodeStatusUpdater the status updater of the machine to wake up
    """
    assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn  # are we attempting to wake up a machine that should always be on ?
    logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName())
    bSuccess = blockingWakeUpMachine(clusterNodeStatusUpdater.getName())
    assert bSuccess
    # activate the associated machine queue
    if not clusterNodeStatusUpdater.setQueueActivation(True):
        assert False
    # the with statement releases the lock even if setPowerState raises
    with clusterNodeStatusUpdater.m_stateLock:
        clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)
    if self.m_wakeUpNotifier:
        logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
        self.m_wakeUpNotifier.onWakeUpComplete()
class SleepRequest( IRequest ):
def process(self, clusterNodeStatusUpdater):
    """
    processes this request

    @param clusterNodeStatusUpdater the status updater this request is processed for
    """
    # this method is abstract
    assert False
def __init__(self, sleepCompleteNotifier):
    """
    @param sleepCompleteNotifier notifier kept for when this request gets processed (may be None)
    """
    self.m_sleepCompleteNotifier = sleepCompleteNotifier
    IRequest.__init__(self, IRequest.GO_TO_SLEEP)
def process(self, clusterNodeStatusUpdater):
    """
    puts the machine to sleep, unless a job is found on its queue

    The queue is deactivated first. If it turns out not to be empty, the queue
    is reactivated, the power state is set back to on and the notifier is told
    that the sleep failed.

    @param clusterNodeStatusUpdater the status updater of the machine to put to sleep
    """
    assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn  # are we attempting to put a machine the should stay on to sleep ?
    logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName())
    if clusterNodeStatusUpdater.setQueueActivation(False):
        if clusterNodeStatusUpdater.queueIsEmpty():
            if blockingPutMachineToSleep(clusterNodeStatusUpdater.m_clusterNodeName):
                # now we know that the machine is asleep
                # (the with statement releases the lock even if setPowerState raises)
                with clusterNodeStatusUpdater.m_stateLock:
                    clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.SLEEP)
                if self.m_sleepCompleteNotifier:
                    self.m_sleepCompleteNotifier.onSleepComplete(True)
            else:
                assert False
        else:
            # reactivate the queue
            if not clusterNodeStatusUpdater.setQueueActivation(True):
                assert False
            with clusterNodeStatusUpdater.m_stateLock:
                clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)  # this is necessary to reenable the various cyclic checks that were disabled on sleep request
            clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived()
            if self.m_sleepCompleteNotifier:
                self.m_sleepCompleteNotifier.onSleepComplete(False)
    else:
        assert False
class WakeUpRequest(IRequest):
class CheckPowerStateRequest( IRequest ):
def __init__(self, wakeUpNotifier):
    """
    @param wakeUpNotifier notifier kept for when this request gets processed (may be None)
    """
    self.m_wakeUpNotifier = wakeUpNotifier
    IRequest.__init__(self, IRequest.WAKE_UP)
def __init__(self):
    """
    builds a request of type CHECK_POWER_STATE
    """
    requestType = IRequest.CHECK_POWER_STATE
    IRequest.__init__(self, requestType)
def process(self, clusterNodeStatusUpdater):
    """
    wakes the machine up, activates its queue and records its power state as on

    @param clusterNodeStatusUpdater the status updater of the machine to wake up
    """
    assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn  # are we attempting to wake up a machine that should always be on ?
    logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName())
    bSuccess = blockingWakeUpMachine(clusterNodeStatusUpdater.getName())
    assert bSuccess
    # activate the associated machine queue
    if not clusterNodeStatusUpdater.setQueueActivation(True):
        assert False
    # the with statement releases the lock even if setPowerState raises
    with clusterNodeStatusUpdater.m_stateLock:
        clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)
    if self.m_wakeUpNotifier:
        logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
        self.m_wakeUpNotifier.onWakeUpComplete()
def process(self, clusterNodeStatusUpdater):
    """
    reads the power state of the machine and hands the reading over to its cluster node

    @param clusterNodeStatusUpdater the status updater of the machine to check
    """
    # the reading is done before taking the lock
    powerState = Util.getPowerState(clusterNodeStatusUpdater.m_clusterNodeName)
    # the with statement releases the lock even if the cluster node raises
    with clusterNodeStatusUpdater.m_stateLock:
        clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading(powerState)
        clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time()
class ClusterNodeStatusUpdater(threading.Thread):
    """
    thread that processes, one at a time, the requests queued for one machine

    It also queues a power state check on its own when the last one is older
    than DELAY_BETWEEN_POWERSTATE_CHECKS.
    """

    DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60  # in seconds

    def __init__(self, machineName, clusterNode, gridEngine):
        """
        @param machineName the name of the machine this thread works for
        @param clusterNode the cluster node that receives the power state updates
        @param gridEngine used to activate the queue of the machine and to test if it's empty
        """
        threading.Thread.__init__(self)
        self.m_clusterNodeName = machineName
        self.m_clusterNode = clusterNode
        self.m_gridEngine = gridEngine
        self.m_bStop = False
        self.m_lastPowerStateCheckTime = None  # time.time()
        self.m_bCheckPowerState = True
        self.m_stateLock = threading.Lock()  # lock that prevents concurrent access to the state of this instance
        self.m_bShouldAlwaysBeOn = False  # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
        self.m_pendingRequestsQueue = []

    def getGridEngine(self):
        return self.m_gridEngine

    def getName(self):
        return self.m_clusterNodeName

    def setShouldAlwaysBeOn(self):
        print('%s should always be on' % (self.getName()))
        self.m_bShouldAlwaysBeOn = True

    def pushRequest(self, request):
        """
        appends a request to the queue of pending requests
        """
        with self.m_stateLock:
            self.m_pendingRequestsQueue.append(request)

    def popRequest(self):
        """
        removes and returns the oldest pending request (None if there's no pending request)
        """
        with self.m_stateLock:
            if len(self.m_pendingRequestsQueue) != 0:
                return self.m_pendingRequestsQueue.pop(0)
        return None

    def run(self):
        """
        processes the pending requests until m_bStop is set
        """
        try:
            while not self.m_bStop:
                # handle the oldest request
                request = self.popRequest()
                if request is not None:
                    request.process(self)
                # schedule a power state check if required
                currentTime = time.time()
                if self.m_bCheckPowerState:
                    if not self.m_bShouldAlwaysBeOn:  # don't do power checks on such machines because some current implementations of
                        # operations involved might cause the machine to go to sleep
                        if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
                            self.pushRequest(CheckPowerStateRequest())
                time.sleep(1)
        except BaseException as exception:  # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
            # 'except X as e' is the only form python 3 accepts
            Util.onException(exception)

    def requestSleep(self, sleepCompleteNotifier=None):
        assert not self.m_bShouldAlwaysBeOn
        self.pushRequest(SleepRequest(sleepCompleteNotifier))

    def requestWakeUp(self, wakeUpNotifier=None):
        assert not self.m_bShouldAlwaysBeOn
        self.pushRequest(WakeUpRequest(wakeUpNotifier))

    def getQueueMachineName(self):
        return self.m_clusterNode.getQueueMachineName()

    def setQueueActivation(self, bEnable):
        """
        @return true on success, false otherwise
        """
        return self.getGridEngine().setQueueInstanceActivation(self.getQueueMachineName(), bEnable)

    def queueIsEmpty(self):
        return self.getGridEngine().queueIsEmpty(self.getName())
class SleepRequest(IRequest):
    """
    request to put a machine to sleep
    """

    def __init__(self, sleepCompleteNotifier):
        """
        @param sleepCompleteNotifier notifier told about the outcome of the request (may be None)
        """
        IRequest.__init__(self, IRequest.GO_TO_SLEEP)
        self.m_sleepCompleteNotifier = sleepCompleteNotifier

    def process(self, clusterNodeStatusUpdater):
        """
        puts the machine to sleep, unless a job is found on its queue

        The queue is deactivated first. If it turns out not to be empty, the
        queue is reactivated, the power state is set back to on and the
        notifier is told that the sleep failed.
        """
        assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn  # are we attempting to put a machine the should stay on to sleep ?
        logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName())
        if clusterNodeStatusUpdater.setQueueActivation(False):
            if clusterNodeStatusUpdater.queueIsEmpty():
                if blockingPutMachineToSleep(clusterNodeStatusUpdater.m_clusterNodeName):
                    # now we know that the machine is asleep
                    # (the with statement releases the lock even if setPowerState raises)
                    with clusterNodeStatusUpdater.m_stateLock:
                        clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.SLEEP)
                    if self.m_sleepCompleteNotifier:
                        self.m_sleepCompleteNotifier.onSleepComplete(True)
                else:
                    assert False
            else:
                # reactivate the queue
                if not clusterNodeStatusUpdater.setQueueActivation(True):
                    assert False
                with clusterNodeStatusUpdater.m_stateLock:
                    clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)  # this is necessary to reenable the various cyclic checks that were disabled on sleep request
                clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived()
                if self.m_sleepCompleteNotifier:
                    self.m_sleepCompleteNotifier.onSleepComplete(False)
        else:
            assert False
class CheckPowerStateRequest(IRequest):
    """
    request to read the current power state of a machine
    """

    def __init__(self):
        IRequest.__init__(self, IRequest.CHECK_POWER_STATE)

    def process(self, clusterNodeStatusUpdater):
        """
        reads the power state of the machine and hands the reading over to its cluster node
        """
        # the reading is done before taking the lock
        powerState = getPowerState(clusterNodeStatusUpdater.m_clusterNodeName)
        # the with statement releases the lock even if the cluster node raises
        with clusterNodeStatusUpdater.m_stateLock:
            clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading(powerState)
            clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time()
class ClusterNodeStatusUpdater(threading.Thread):
    """
    thread that processes, one at a time, the requests queued for one machine

    It also queues a power state check on its own when the last one is older
    than DELAY_BETWEEN_POWERSTATE_CHECKS.
    """

    DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60  # in seconds

    def __init__(self, machineName, clusterNode, gridEngine):
        """
        @param machineName the name of the machine this thread works for
        @param clusterNode the cluster node that receives the power state updates
        @param gridEngine used to activate the queue of the machine and to test if it's empty
        """
        threading.Thread.__init__(self)
        self.m_clusterNodeName = machineName
        self.m_clusterNode = clusterNode
        self.m_gridEngine = gridEngine
        self.m_bStop = False
        self.m_lastPowerStateCheckTime = None  # time.time()
        self.m_bCheckPowerState = True
        self.m_stateLock = threading.Lock()  # lock that prevents concurrent access to the state of this instance
        self.m_bShouldAlwaysBeOn = False  # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
        self.m_pendingRequestsQueue = []

    def getGridEngine(self):
        return self.m_gridEngine

    def getName(self):
        return self.m_clusterNodeName

    def setShouldAlwaysBeOn(self):
        print('%s should always be on' % (self.getName()))
        self.m_bShouldAlwaysBeOn = True

    def pushRequest(self, request):
        """
        appends a request to the queue of pending requests
        """
        with self.m_stateLock:
            self.m_pendingRequestsQueue.append(request)

    def popRequest(self):
        """
        removes and returns the oldest pending request (None if there's no pending request)
        """
        with self.m_stateLock:
            if len(self.m_pendingRequestsQueue) != 0:
                return self.m_pendingRequestsQueue.pop(0)
        return None

    def run(self):
        """
        processes the pending requests until m_bStop is set
        """
        try:
            while not self.m_bStop:
                # handle the oldest request
                pendingRequest = self.popRequest()
                if pendingRequest is not None:
                    pendingRequest.process(self)
                # schedule a power state check if required
                currentTime = time.time()
                # power checks are not done on machines that should always be on because some current
                # implementations of operations involved might cause the machine to go to sleep
                if self.m_bCheckPowerState and not self.m_bShouldAlwaysBeOn:
                    if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
                        self.pushRequest(CheckPowerStateRequest())
                time.sleep(1)
        except BaseException as exception:  # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
            onException(exception)

    def requestSleep(self, sleepCompleteNotifier=None):
        assert not self.m_bShouldAlwaysBeOn
        sleepRequest = SleepRequest(sleepCompleteNotifier)
        self.pushRequest(sleepRequest)

    def requestWakeUp(self, wakeUpNotifier=None):
        assert self.m_bShouldAlwaysBeOn is False
        wakeUpRequest = WakeUpRequest(wakeUpNotifier)
        self.pushRequest(wakeUpRequest)

    def getQueueMachineName(self):
        return self.m_clusterNode.getQueueMachineName()

    def setQueueActivation(self, bEnable):
        """
        @return true on success, false otherwise
        """
        queueMachineName = self.getQueueMachineName()
        return self.getGridEngine().setQueueInstanceActivation(queueMachineName, bEnable)

    def queueIsEmpty(self):
        return self.getGridEngine().queueIsEmpty(self.getName())

View File

@ -1,209 +1,209 @@
import threading
from JobsStateUpdater import *
from JobsStateUpdater import JobsStateUpdater
import Lib.Util
import Lib.SimpaDbUtil
from ClusterNode import *
from ClusterNode import ClusterNode
from Log import logInfo, logError
from PowerState import PowerState
import time
class ClusterStatus:
"""
The current state (jobs, sensors) of the cluster
@param gridEngine the interface to the batch job tool (in our case it's sun grid engine)
"""
def __init__(self, gridEngine):
# gridEngine is stored and also handed over to every ClusterNode created below
self.m_gridEngine = gridEngine
# the cluster nodes under control, indexed by machine name
self.m_clusterNodes = {}
self.m_lock = threading.Lock() # to prevent concurrent access to this instance
self.m_jobsStateUpdater = JobsStateUpdater( self )
# stays None until a first jobs state has been received
self.m_jobsState = None
#self.m_controlledMachineNames = [ 'simpatix30' ]
self.m_controlledMachineNames = [] # [ 'simpatix30' ]
# disabled code : would put simpatix11 to simpatix39 (apart from the machines skipped below) under control
if False:
for iMachine in range(11, 40):
if (iMachine == 31) or (iMachine == 32):
continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
if (iMachine == 18):
continue # this machine needs maintenance (restarting because it's very slow for an unknown reason)
self.m_controlledMachineNames.append( 'simpatix%d' % iMachine )
nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames()
# NOTE(review): indentation is not visible here; presumably a ClusterNode is only created for the controlled machines - confirm
for nodeName in nodeNames:
if nodeName in self.m_controlledMachineNames:
logInfo( 'machine %s is under the cluster controller\'s control' % nodeName )
clusterNode = ClusterNode( nodeName, self, gridEngine )
# simpatix10 is flagged as a machine that should always be on
if nodeName == 'simpatix10':
clusterNode.setShouldAlwaysBeOn()
self.m_clusterNodes[ nodeName ] = clusterNode
return
def setControlOnMachine(self, machineName, bControl):
if bControl:
# add machineName under control of ClusterController
for k, v in self.m_clusterNodes.items():
if v.getName() == machineName :
return # nothing to do : machineName is already under the control of ClusterController
"""
The current state (jobs, sensors) of the cluster
clusterNode = ClusterNode( machineName, self, self.m_gridEngine )
if machineName == 'simpatix10':
clusterNode.setShouldAlwaysBeOn()
self.m_clusterNodes[ machineName ] = clusterNode
clusterNode.m_machineStatusUpdater.start()
else:
# remove machineName from control of ClusterController
clusterNode = self.m_clusterNodes.get(machineName)
if clusterNode:
clusterNode.m_machineStatusUpdater.m_bStop = True
clusterNode.m_machineStatusUpdater.join()
self.m_clusterNodes.pop(machineName)
def getGridEngine(self):
    """
    returns the grid engine given at construction
    """
    gridEngine = self.m_gridEngine
    return gridEngine
def getMachines(self):
    """
    returns the dictionary of the cluster nodes under control, indexed by machine name
    """
    clusterNodes = self.m_clusterNodes
    return clusterNodes
def startReadingThreads(self):
    """
    starts the status updater of every cluster node, then the jobs state updater
    """
    for clusterNode in self.m_clusterNodes.values():
        clusterNode.m_machineStatusUpdater.start()
    self.m_jobsStateUpdater.start()
def stopReadingThreads(self):
    """
    asks every status updater, then the jobs state updater, to stop and waits for each of them
    """
    for clusterNode in self.m_clusterNodes.values():
        statusUpdater = clusterNode.m_machineStatusUpdater
        statusUpdater.m_bStop = True
        statusUpdater.join()
    jobsStateUpdater = self.m_jobsStateUpdater
    jobsStateUpdater.m_bStop = True
    jobsStateUpdater.join()
def onNewJobsState(self, newJobsState):
    """
    stores a new jobs state, under the protection of the lock of this instance

    @param newJobsState the jobs state that replaces the current one
    """
    with self.m_lock:
        self.m_jobsState = newJobsState
def getJobsOnMachine(self, machineName):
    """
    returns what the current jobs state reports as the jobs on the given machine

    @param machineName the name of the machine
    """
    jobsState = self.m_jobsState
    return jobsState.getJobsOnMachine(machineName)
def isReady( self ):
# tells if every cluster node is ready and a jobs state is available
# NOTE(review): indentation is not visible here; confirm the exact nesting of the checks below
for k, v in self.m_clusterNodes.items():
# a single node that isn't ready makes the whole cluster status not ready
if not v.isReady():
logInfo( 'ClusterStatus::isReady : not ready because of ' + v.getName() )
return False
#log('ClusterStatus::isReady() : '+k+' is ready')
#assert( False )
# m_jobsState is None as long as no jobs state has been stored
if self.m_jobsState == None:
logInfo( 'ClusterStatus::isReady : not ready because waiting for jobs state' )
return False
return True
def getIdleMachines(self):
    """
    returns the machines that are on and have no job

    @return a dictionary of cluster nodes indexed by machine name
    """
    # isReady is a method : it has to be called, otherwise the assertion tests
    # the method object itself, which is always true
    assert self.isReady()
    bBUG_00000009_IS_STILL_ALIVE = True
    if bBUG_00000009_IS_STILL_ALIVE:
        currentTime = time.time()
        fJOBS_STATE_MAX_ALLOWED_AGE = 3600
        fJobsStateAge = currentTime - self.m_jobsState.getTime()
        if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
            logError('ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge))
            assert False
    idleMachines = {}
    for machineName, machine in self.m_clusterNodes.items():
        if machine.getPowerState() == PowerState.ON:
            jobsOnThisMachine = self.getJobsOnMachine(machineName)
            if len(jobsOnThisMachine) == 0:
                idleMachines[machineName] = machine
    return idleMachines
def getPendingJobs(self):
    """
    returns what the current jobs state reports as the pending jobs
    """
    jobsState = self.m_jobsState
    return jobsState.getPendingJobs()
def getJobsState(self):
    """
    returns the current jobs state (None if no jobs state has been stored yet)
    """
    jobsState = self.m_jobsState
    return jobsState
def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements):
    """
    tells if the given queue machine is allowed to run a job that has the given requirements

    @param queueMachine the queue machine to test
    @param jobRequirements the requirements of the job; no allowed queues means that any queue is accepted
    @return False if the job is restricted to queues that don't include the queue of queueMachine
    """
    allowedQueues = jobRequirements.m_queues
    if not allowedQueues:
        return True
    matchingQueues = [queueName for queueName in allowedQueues if queueName == queueMachine.getQueueName()]
    if matchingQueues:
        return True
    logInfo('queueMachineFitsJobRequirements : queueMachine '+queueMachine.getName()+' rejected because it\'s not in the allowed queues')
    return False
def getEnergyConsumption(self):
    """
    returns an estimate of the energy consumption since the start of the cluster controller (in joules)
    """
    # there are cases where the machine is not ready yet (for example, it's just been
    # added to clustercontroller's control) : such machines are left out
    return sum((machine.getEnergyConsumption() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)
def getEnergySavings(self):
    """
    returns an estimate of the energy saving since the start of the cluster controller (in joules)
    """
    # machines that are not ready are left out
    return sum((machine.getEnergySavings() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)
def getCurrentPowerConsumption(self):
    """
    returns the sum of the current power consumptions of the machines that are ready
    """
    return sum((machine.getPowerConsumption() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)
def getCurrentPowerSavings(self):
    """
    returns, summed over the machines that are ready, the difference between the
    power consumption in the ON power state and the current power consumption
    """
    fPowerSavings = 0.0
    readyMachines = (machine for machine in self.m_clusterNodes.values() if machine.isReady())
    for machine in readyMachines:
        fPowerWhenOn = machine.getPowerConsumptionForPowerState(PowerState.ON)
        fPowerSavings += fPowerWhenOn - machine.getPowerConsumption()
    return fPowerSavings
@param gridEngine the interface to the batch job tool (in our case it's sun grid engine)
"""
def __init__(self, gridEngine):
# gridEngine is stored and also handed over to every ClusterNode created below
self.m_gridEngine = gridEngine
# the cluster nodes under control, indexed by machine name
self.m_clusterNodes = {}
self.m_lock = threading.Lock() # to prevent concurrent access to this instance
self.m_jobsStateUpdater = JobsStateUpdater(self)
# stays None until a first jobs state has been received
self.m_jobsState = None
# self.m_controlledMachineNames = ['simpatix30']
self.m_controlledMachineNames = [] # ['simpatix30']
# disabled code : would put simpatix11 to simpatix39 (apart from the machines skipped below) under control
if False:
for iMachine in range(11, 40):
if (iMachine == 31) or (iMachine == 32):
continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
if (iMachine == 18):
continue # this machine needs maintenance (restarting because it's very slow for an unknown reason)
self.m_controlledMachineNames.append('simpatix%d' % iMachine)
nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames()
# NOTE(review): indentation is not visible here; presumably a ClusterNode is only created for the controlled machines - confirm
for nodeName in nodeNames:
if nodeName in self.m_controlledMachineNames:
logInfo('machine %s is under the cluster controller\'s control' % nodeName)
clusterNode = ClusterNode(nodeName, self, gridEngine)
# simpatix10 is flagged as a machine that should always be on
if nodeName == 'simpatix10':
clusterNode.setShouldAlwaysBeOn()
self.m_clusterNodes[nodeName] = clusterNode
return
def getNumControlledSlots(self):
    """
    returns the total number of slots of the machines under control
    """
    # the with statement releases the lock even if a lookup in the jobs state raises;
    # a lock left acquired would block every later call that needs it
    with self.m_lock:
        iNumControlledSlots = 0
        for machine in self.m_clusterNodes.values():
            queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
            iNumControlledSlots += queueMachine.getNumSlots()
    return iNumControlledSlots
def getNumUsedSlots(self):
    """
    returns the number of slots that are not free on the machines under control
    """
    # the with statement releases the lock even if the assertion below fails
    # or if a lookup in the jobs state raises
    with self.m_lock:
        iNumUsedSlots = 0
        for machine in self.m_clusterNodes.values():
            queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
            iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
            assert iNumUsedSlotsOnThisMachine >= 0
            iNumUsedSlots += iNumUsedSlotsOnThisMachine
    return iNumUsedSlots
def getNumWastedSlots(self):
    """
    returns the number of free slots on the machines that are on
    """
    # the with statement releases the lock even if a machine or the jobs state raises
    with self.m_lock:
        iNumWastedSlots = 0
        for machine in self.m_clusterNodes.values():
            if machine.getPowerState() == PowerState.ON:
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
    return iNumWastedSlots
def setControlOnMachine(self, machineName, bControl):
if bControl:
# add machineName under control of ClusterController
for k, v in self.m_clusterNodes.items():
if v.getName() == machineName:
return # nothing to do : machineName is already under the control of ClusterController
def getNumSleepingSlots(self):
    """
    returns the number of free slots on the machines that are sleeping
    """
    # the with statement releases the lock even if a machine or the jobs state raises
    with self.m_lock:
        iNumSleepingSlots = 0
        for machine in self.m_clusterNodes.values():
            if machine.getPowerState() == PowerState.SLEEP:
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
    return iNumSleepingSlots
clusterNode = ClusterNode(machineName, self, self.m_gridEngine)
if machineName == 'simpatix10':
clusterNode.setShouldAlwaysBeOn()
self.m_clusterNodes[machineName] = clusterNode
clusterNode.m_machineStatusUpdater.start()
else:
# remove machineName from control of ClusterController
clusterNode = self.m_clusterNodes.get(machineName)
if clusterNode:
clusterNode.m_machineStatusUpdater.m_bStop = True
clusterNode.m_machineStatusUpdater.join()
self.m_clusterNodes.pop(machineName)
def getGridEngine(self):
    """
    returns the grid engine given at construction
    """
    gridEngine = self.m_gridEngine
    return gridEngine
def getMachines(self):
    """
    returns the dictionary of the cluster nodes under control, indexed by machine name
    """
    clusterNodes = self.m_clusterNodes
    return clusterNodes
def startReadingThreads(self):
    """
    starts the status updater of every cluster node, then the jobs state updater
    """
    for clusterNode in self.m_clusterNodes.values():
        clusterNode.m_machineStatusUpdater.start()
    self.m_jobsStateUpdater.start()
def stopReadingThreads(self):
    """
    asks every status updater, then the jobs state updater, to stop and waits for each of them
    """
    for clusterNode in self.m_clusterNodes.values():
        statusUpdater = clusterNode.m_machineStatusUpdater
        statusUpdater.m_bStop = True
        statusUpdater.join()
    jobsStateUpdater = self.m_jobsStateUpdater
    jobsStateUpdater.m_bStop = True
    jobsStateUpdater.join()
def onNewJobsState(self, newJobsState):
    """
    stores a new jobs state, under the protection of the lock of this instance

    @param newJobsState the jobs state that replaces the current one
    """
    with self.m_lock:
        self.m_jobsState = newJobsState
def getJobsOnMachine(self, machineName):
    """
    returns what the current jobs state reports as the jobs on the given machine

    @param machineName the name of the machine
    """
    jobsState = self.m_jobsState
    return jobsState.getJobsOnMachine(machineName)
def isReady(self):
# tells if every cluster node is ready and a jobs state is available
# NOTE(review): indentation is not visible here; confirm the exact nesting of the checks below
for k, v in self.m_clusterNodes.items():
# a single node that isn't ready makes the whole cluster status not ready
if not v.isReady():
logInfo('ClusterStatus::isReady : not ready because of ' + v.getName())
return False
# log('ClusterStatus::isReady() : '+k+' is ready')
# assert(False)
# m_jobsState is None as long as no jobs state has been stored
if self.m_jobsState is None:
logInfo('ClusterStatus::isReady : not ready because waiting for jobs state')
return False
return True
def getIdleMachines(self):
    """
    returns the machines that are on and have no job

    @return a dictionary of cluster nodes indexed by machine name
    """
    # isReady is a method : it has to be called, otherwise the assertion tests
    # the method object itself, which is always true
    assert self.isReady()
    bBUG_00000009_IS_STILL_ALIVE = True
    if bBUG_00000009_IS_STILL_ALIVE:
        currentTime = time.time()
        fJOBS_STATE_MAX_ALLOWED_AGE = 3600
        fJobsStateAge = currentTime - self.m_jobsState.getTime()
        if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
            logError('ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge))
            assert False
    idleMachines = {}
    for machineName, machine in self.m_clusterNodes.items():
        if machine.getPowerState() == PowerState.ON:
            jobsOnThisMachine = self.getJobsOnMachine(machineName)
            if len(jobsOnThisMachine) == 0:
                idleMachines[machineName] = machine
    return idleMachines
def getPendingJobs(self):
    """
    returns what the current jobs state reports as the pending jobs
    """
    jobsState = self.m_jobsState
    return jobsState.getPendingJobs()
def getJobsState(self):
    """
    returns the current jobs state (None if no jobs state has been stored yet)
    """
    jobsState = self.m_jobsState
    return jobsState
def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements):
    """
    tells if the given queue machine is allowed to run a job that has the given requirements

    @param queueMachine the queue machine to test
    @param jobRequirements the requirements of the job; no allowed queues means that any queue is accepted
    @return False if the job is restricted to queues that don't include the queue of queueMachine
    """
    allowedQueues = jobRequirements.m_queues
    if not allowedQueues:
        return True
    matchingQueues = [queueName for queueName in allowedQueues if queueName == queueMachine.getQueueName()]
    if matchingQueues:
        return True
    logInfo('queueMachineFitsJobRequirements : queueMachine ' + queueMachine.getName() + ' rejected because it\'s not in the allowed queues')
    return False
def getEnergyConsumption(self):
    """
    returns an estimate of the energy consumption since the start of the cluster controller (in joules)
    """
    # there are cases where the machine is not ready yet (for example, it's just been
    # added to clustercontroller's control) : such machines are left out
    return sum((machine.getEnergyConsumption() for machine in self.m_clusterNodes.values() if machine.isReady()), 0.0)
def getEnergySavings(self):
    """
    returns an estimate of the energy saving since the start of the cluster controller (in joules)

    Only the machines that are ready contribute to the total.
    """
    readyMachines = (machine for machine in self.m_clusterNodes.values() if machine.isReady())
    fTotalSavings = 0.0
    for readyMachine in readyMachines:
        fTotalSavings += readyMachine.getEnergySavings()
    return fTotalSavings
def getCurrentPowerConsumption(self):
    """
    returns the sum of the power consumptions reported by the machines that are ready
    """
    readyMachines = (machine for machine in self.m_clusterNodes.values() if machine.isReady())
    fTotalPower = 0.0
    for readyMachine in readyMachines:
        fTotalPower += readyMachine.getPowerConsumption()
    return fTotalPower
def getCurrentPowerSavings(self):
    """
    returns the sum, over the machines that are ready, of the difference between the power
    consumption the machine has in the ON power state and its current power consumption
    """
    readyMachines = (machine for machine in self.m_clusterNodes.values() if machine.isReady())
    fTotalSavings = 0.0
    for readyMachine in readyMachines:
        fPowerWhenOn = readyMachine.getPowerConsumptionForPowerState(PowerState.ON)
        fTotalSavings += fPowerWhenOn - readyMachine.getPowerConsumption()
    return fTotalSavings
def getNumControlledSlots(self):
    """
    returns the total number of slots of the queue machines associated with the cluster nodes

    :return int: the sum of the number of slots of each node's queue machine
    """
    # the with statement guarantees that the lock is released even if an exception is raised
    # (eg when no queue machine is found for a node), so that other threads can't be blocked forever
    with self.m_lock:
        iNumControlledSlots = 0
        for machine in self.m_clusterNodes.values():
            queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
            iNumControlledSlots += queueMachine.getNumSlots()
    return iNumControlledSlots
def getNumUsedSlots(self):
    """
    returns the number of slots that are currently used on the queue machines associated with the cluster nodes

    :return int: the sum, over the nodes, of (number of slots - number of free slots)
    """
    # the with statement guarantees that the lock is released even if an exception is raised
    # (including a failure of the assertion below), so that other threads can't be blocked forever
    with self.m_lock:
        iNumUsedSlots = 0
        for machine in self.m_clusterNodes.values():
            queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
            iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
            assert iNumUsedSlotsOnThisMachine >= 0
            iNumUsedSlots += iNumUsedSlotsOnThisMachine
    return iNumUsedSlots
def getNumWastedSlots(self):
    """
    returns the number of free slots on the machines that are in the ON power state

    :return int: the sum of the free slots of the queue machines associated with the powered on nodes
    """
    # the with statement guarantees that the lock is released even if an exception is raised
    # (eg when no queue machine is found for a node), so that other threads can't be blocked forever
    with self.m_lock:
        iNumWastedSlots = 0
        for machine in self.m_clusterNodes.values():
            if machine.getPowerState() == PowerState.ON:
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
    return iNumWastedSlots
def getNumSleepingSlots(self):
    """
    returns the number of free slots on the machines that are in the SLEEP power state

    :return int: the sum of the free slots of the queue machines associated with the sleeping nodes
    """
    # the with statement guarantees that the lock is released even if an exception is raised
    # (eg when no queue machine is found for a node), so that other threads can't be blocked forever
    with self.m_lock:
        iNumSleepingSlots = 0
        for machine in self.m_clusterNodes.values():
            if machine.getPowerState() == PowerState.SLEEP:
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
    return iNumSleepingSlots

View File

@ -1,9 +1,9 @@
#!/usr/bin/env python
"""
script that installs ClusterController on simpatix10
to start ClusterController :
launchctl start fr.univ-rennes1.ipr.ClusterController
script that installs ClusterController on simpatix10
to start ClusterController :
launchctl start fr.univ-rennes1.ipr.ClusterController
"""
import sys
sys.path.insert(0, '..')
@ -11,32 +11,32 @@ from Lib.Util import *
import os
if __name__ == '__main__':
    # installs ClusterController on the target machine by running a sequence of shell commands there, as root, through ssh
    machineName = 'simpatix10'
    strThisDir = os.getcwd()
    strPythonDevDir = strThisDir + '/..'
    print('installing ClusterController on ' + machineName)
    # the commands are run in this order on the remote machine ; they are separated with ';' so that a failing
    # command (eg rm on a directory that doesn't exist yet) doesn't prevent the following ones from running
    remoteCommands = [
        'mkdir -p /usr/local/bin/ipr/Python',
        'rm -r /usr/local/bin/ipr/Python/Lib',
        'rm -r /usr/local/bin/ipr/Python/ClusterController',
        'cp -r %s/Lib /usr/local/bin/ipr/Python/' % strPythonDevDir,
        'cp -r %s/ClusterController /usr/local/bin/ipr/Python/' % strPythonDevDir,
        'cp %s/ClusterController/ClusterController.plist /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist' % strPythonDevDir,
        'cp -r %s/ClusterController/ClusterControllerLauncher.sh /usr/local/bin/ipr/Python/ClusterController/' % strPythonDevDir,
        'launchctl unload /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist',
        'launchctl load /Library/LaunchDaemons/fr.univ-rennes1.ipr.ClusterController.plist',
    ]
    remoteCommand = ''.join(strCommand + ';' for strCommand in remoteCommands)
    command = 'ssh root@' + machineName + ' "' + remoteCommand + '"'
    (returnCode, stdout, stderr) = executeCommand(command)
    for strSingleCommand in remoteCommand.split(';'):
        print(strSingleCommand)
    # NOTE(review): the nesting of the two prints below was not recoverable from the reviewed text ; they are assumed to follow the loop -- confirm
    print(stdout)
    print(stderr)
    if returnCode == 0:
        print('install succeeded on ' + machineName)
    else:
        print('install failed on ' + machineName + ' (see below for detail)')
        # print is called as a function, like everywhere else in this script : the print statement is a syntax error in python 3
        print(stderr)
        # assert( False )

View File

@ -1,130 +1,146 @@
class JobStateFlags:
    """
    the flags describing the state of a job. Each flag is a distinct power of two.
    """
    RUNNING = 1 << 0  # the job is running
    WAITING = 1 << 1  # the job is waiting
    QUEUED = 1 << 2  # not sure what that exactly means but it reflects the q state of jobs as seen in the pending jobs list from qstat -f -u \*
    TRANSFERING = 1 << 3
    DELETED = 1 << 4
    HOLD = 1 << 5
    ERROR = 1 << 6
    SUSPENDED = 1 << 7
class ParallelEnvironment:
    """
    the identifiers of the parallel environments a job can require
    """
    MPI = 1
class JobRequirements:
    """
    the requirements of a job ; every requirement is None until it is set
    """

    def __init__(self):
        # the number of slots
        self.m_numSlots = None
        # machine architecture
        self.m_strArchitecture = None
        self.m_parallelEnvironment = None
        # the list of queues this job is allowed to run on
        self.m_queues = None
class JobId:
    """
    the identifier of a job.

    We treat each element of a job array as a separate job
    A single integer is no longer enough to identify a job because all elements in a job array
    share the same sge job identifier. To uniquely define a job array element, we also use the task id.
    """
    MAX_NUM_JOBS_IN_ARRAY = 1000000

    def __init__(self, iJobId, iJobArrayElementId=None):
        """
        :param int iJobId: the sge job identifier
        :param iJobArrayElementId: the task id of the job array element, or None if this identifier does not refer to a job array element
        """
        if iJobArrayElementId is not None:
            assert iJobArrayElementId <= self.MAX_NUM_JOBS_IN_ARRAY
        self.m_iJobId = iJobId
        self.m_iJobArrayElementId = iJobArrayElementId  # None if this identifier does not refer to a job array element

    def __hash__(self):
        """
        required to use a JobId as a dict hash key
        """
        # the local variable is not named hash, to avoid shadowing the builtin function
        iHash = self.m_iJobId * self.MAX_NUM_JOBS_IN_ARRAY
        if self.m_iJobArrayElementId is not None:
            iHash += self.m_iJobArrayElementId
        return iHash

    def __eq__(self, other):
        """
        required to use a JobId as a dict hash key
        """
        if not isinstance(other, JobId):
            # let python handle the comparison with other types (eg None) instead of failing with an AttributeError
            return NotImplemented
        return self.m_iJobId == other.m_iJobId and self.m_iJobArrayElementId == other.m_iJobArrayElementId

    def __ne__(self, other):
        """
        keeps != consistent with == (python 2 doesn't derive != from __eq__)
        """
        isEqual = self.__eq__(other)
        if isEqual is NotImplemented:
            return NotImplemented
        return not isEqual

    def __repr__(self):
        return 'JobId(%r, %r)' % (self.m_iJobId, self.m_iJobArrayElementId)

    def isJobArrayElement(self):
        """
        tells if this identifier refers to an element of a job array
        """
        return self.m_iJobArrayElementId is not None

    def getMainId(self):
        """
        returns the sge job identifier (shared by all the elements of a job array)
        """
        return self.m_iJobId

    def asStr(self):
        """
        returns the identifier as a string : '<job id>' or, for a job array element, '<job id>.<task id>'
        """
        strResult = '%s' % self.m_iJobId
        if self.isJobArrayElement():
            strResult += '.%d' % self.m_iJobArrayElementId
        return strResult
class Job:
    """
    a job, identified by its job id, together with what is known about it : owner, script name, start and
    submit times, state flags, requirements, and the slots it uses on each queue machine
    """

    def __init__(self, jobId):
        self.m_jobId = jobId
        self.m_owner = None
        self.m_scriptName = None
        self.m_startTime = None
        self.m_submitTime = None
        self.m_stateFlags = 0
        self.m_slots = {}  # number of slots, indexed by queue machine name
        self.m_jobRequirements = JobRequirements()
        self.m_requestedRamPerCore = 0

    def getId(self):
        return self.m_jobId

    def setState(self, state):
        self.m_stateFlags = state

    def setOwner(self, jobOwner):
        # once set, the owner is not expected to change
        assert not self.m_owner or self.m_owner == jobOwner
        self.m_owner = jobOwner

    def getOwner(self):
        return self.m_owner

    def setStartTime(self, jobStartTime):
        # once set, the start time is not expected to change
        assert not self.m_startTime or self.m_startTime == jobStartTime
        self.m_startTime = jobStartTime

    def setSubmitTime(self, jobSubmitTime):
        # once set, the submit time is not expected to change
        assert not self.m_submitTime or self.m_submitTime == jobSubmitTime
        self.m_submitTime = jobSubmitTime

    def getStartTime(self):
        return self.m_startTime

    def setScriptName(self, jobScriptName):
        # once set, the script name is not expected to change
        assert not self.m_scriptName or self.m_scriptName == jobScriptName
        self.m_scriptName = jobScriptName

    def addSlots(self, queueMachineName, numSlots):
        """
        declares that this job uses numSlots slots on the given queue machine
        """
        numSlotsAlreadyRegistered = self.m_slots.get(queueMachineName)
        assert numSlotsAlreadyRegistered is None
        if numSlotsAlreadyRegistered is not None:
            # should never happen
            numSlots = numSlotsAlreadyRegistered + numSlots
        self.m_slots[queueMachineName] = numSlots

    def getSlots(self):
        return self.m_slots

    def setNumRequiredSlots(self, numSlots):
        self.m_jobRequirements.m_numSlots = numSlots

    def isPending(self):
        """
        returns a true value (the QUEUED bit of the state flags) if this job is waiting in the queue for whatever reason
        """
        return self.m_stateFlags & JobStateFlags.QUEUED

    def getRequestedRamPerCore(self):
        """
        requested RAM per core in bytes
        """
        return self.m_requestedRamPerCore

    def setRequestedRamPerCore(self, requestedRam):
        """
        requestedRam : requested RAM per core in bytes
        """
        self.m_requestedRamPerCore = requestedRam

View File

@ -1,85 +1,86 @@
from .Log import *
class JobsState:
    r"""
    represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*"
    """

    def __init__(self):
        self.m_jobs = {}  # the jobs, indexed by job id
        self.m_jobArrayJobs = {}  # a dictionary of jobs for each job array, indexed by job array id
        self.m_queueMachines = {}  # the queue machines (such as allintel.q@simpatix10), indexed by name
        self.m_stateTime = None  # the time at which the state was snapshot

    def deleteAllJobs(self):
        """
        forgets all the jobs (the queue machines are kept)
        """
        self.m_jobs = {}
        self.m_jobArrayJobs = {}

    def addJob(self, job):
        """
        registers a job ; a job array element is also registered in the tasks of its job array
        """
        jobId = job.getId()
        self.m_jobs[jobId] = job
        if jobId.isJobArrayElement():
            self.m_jobArrayJobs.setdefault(jobId.m_iJobId, {})[jobId] = job

    def getJob(self, jobId):
        """
        returns the job with the given id, or None if there's no such job
        """
        return self.m_jobs.get(jobId)

    def getJobArrayJobs(self, iJobArrayId):
        """
        returns the jobs of the given job array (indexed by job id), or None if no element of this job array has been added
        """
        return self.m_jobArrayJobs.get(iJobArrayId)

    def setTime(self, stateTime):
        self.m_stateTime = stateTime

    def getTime(self):
        return self.m_stateTime

    def getJobsOnMachine(self, machineName):
        """
        returns the jobs (indexed by job id) that use slots on the given machine

        The name of a queue machine is expected to be of the form <queue name>@<machine name>.
        """
        jobsOnMachine = {}
        for jobId, job in self.m_jobs.items():
            for queueMachineName in job.getSlots():
                if queueMachineName.split('@')[1] == machineName:
                    jobsOnMachine[jobId] = job
        return jobsOnMachine

    def getNumFreeSlotsOnQueueMachine(self, queueMachine):
        """
        returns the number of slots of the given queue machine that are not used by any job
        """
        queueMachineName = queueMachine.getName()
        numUsedSlots = 0
        for job in self.m_jobs.values():
            numUsedSlotsByThisJob = job.getSlots().get(queueMachineName)
            if numUsedSlotsByThisJob is not None:
                numUsedSlots += numUsedSlotsByThisJob
        numFreeSlots = queueMachine.getNumSlots() - numUsedSlots
        assert numFreeSlots >= 0
        return numFreeSlots

    def addQueueMachine(self, queueMachine):
        self.m_queueMachines[queueMachine.getName()] = queueMachine

    def getQueueMachine(self, machineName):
        """
        finds the queue machine associated with a machine

        :return: the queue machine, or None if no queue machine is on this machine
        """
        queueMachine = None
        for qm in self.m_queueMachines.values():
            if qm.m_machineName == machineName:
                assert queueMachine is None  # to be sure that no more than one queue machine is on a given machine
                queueMachine = qm
        return queueMachine

    def getQueueMachines(self):
        return self.m_queueMachines

    def getPendingJobs(self):
        """
        returns the jobs (indexed by job id) for which isPending() is true
        """
        pendingJobs = {}
        for job in self.m_jobs.values():
            if job.isPending():
                pendingJobs[job.getId()] = job
        return pendingJobs

View File

@ -6,30 +6,30 @@ import sys
import time
class JobsStateUpdater(threading.Thread):
    """
    thread that periodically asks the grid engine for the current jobs state, and passes it to the cluster status
    """
    DELAY_BETWEEN_STATUS_CHECKS = 10  # in seconds

    def __init__(self, clusterStatus):
        threading.Thread.__init__(self)
        self.m_clusterStatus = clusterStatus
        self.m_bStop = False  # the thread's loop ends when this flag is found set to True

    def getName(self):
        return 'JobsStateUpdater'

    def getGridEngine(self):
        return self.m_clusterStatus.getGridEngine()

    def updateClusterStatus(self):
        """
        retrieves the current jobs state from the grid engine and notifies the cluster status of it
        """
        # log('JobsStateUpdater::updateClusterStatus : start')
        jobsState = self.getGridEngine().getCurrentJobsState()
        # update the jobs in the cluster status
        self.m_clusterStatus.onNewJobsState(jobsState)
        # log('JobsStateUpdater::updateClusterStatus : end')

    def run(self):
        """
        updates the cluster status every DELAY_BETWEEN_STATUS_CHECKS seconds, until m_bStop is set
        """
        try:
            while not self.m_bStop:
                self.updateClusterStatus()
                time.sleep(JobsStateUpdater.DELAY_BETWEEN_STATUS_CHECKS)
        # 'except X as e' is accepted by both python 2.6+ and python 3, unlike the 'except X, e' form
        except BaseException as exception:  # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
            Util.onException(exception)

View File

@ -1,29 +1,33 @@
import time
import threading
gLogFilePath = '/tmp/ClusterController.log'#'/var/log/ClusterController.log'
gLogFilePath = '/tmp/ClusterController.log' # '/var/log/ClusterController.log'
def log( message ):
threadName = threading.currentThread().getName()
logMessage = time.asctime(time.localtime())+' : '+ threadName + ' : ' + message
print(logMessage)
f = open(gLogFilePath, 'a+')
assert( f )
try:
f.write( logMessage + '\n' )
finally:
f.close()
def logDebug( message ):
log('[D]'+message)
return
def logInfo( message ):
log('[I]'+message)
def log(message):
    """
    prints the message on the standard output and appends it to the log file gLogFilePath

    The message is prefixed with the current local time and the name of the calling thread.

    :param str message: the message to log
    """
    # getName() is called (rather than reading the name attribute) so that threads overriding getName are reported under that name
    threadName = threading.current_thread().getName()
    logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message
    print(logMessage)
    # the with statement closes the file even if the write fails
    with open(gLogFilePath, 'a+') as f:
        f.write(logMessage + '\n')
def logWarning( message ):
log('[W]'+message)
def logError( message ):
log('[E]'+message)
def logDebug(message):
    """
    logs a debug message (the message is prefixed with '[D]')
    """
    strPrefix = '[D]'
    log(strPrefix + message)
def logInfo(message):
    """
    logs an information message (the message is prefixed with '[I]')
    """
    strPrefix = '[I]'
    log(strPrefix + message)
def logWarning(message):
    """
    logs a warning message (the message is prefixed with '[W]')
    """
    strPrefix = '[W]'
    log(strPrefix + message)
def logError(message):
    """
    logs an error message (the message is prefixed with '[E]')
    """
    strPrefix = '[E]'
    log(strPrefix + message)

View File

@ -1,21 +1,22 @@
class PowerState:
    """
    the identifiers of the power states
    """
    (
        UNKNOWN,
        OFF,
        ON,
        SLEEP,
        UNPLUGGED,
    ) = range(5)
def PowerStateToStr( powerState ):
if powerState == PowerState.UNKNOWN:
return 'UNKNOWN'
if powerState == PowerState.OFF:
return 'OFF'
if powerState == PowerState.ON:
return 'ON'
if powerState == PowerState.SLEEP:
return 'SLEEP'
if powerState == PowerState.UNPLUGGED:
return 'UNPLUGGED'
else:
assert( False )
def PowerStateToStr(powerState):
    """
    returns the name of the given power state

    :param powerState: one of the PowerState values ; any other value triggers an assertion failure
    :return str: 'UNKNOWN', 'OFF', 'ON', 'SLEEP' or 'UNPLUGGED'
    """
    # the states are tested in this order, with ==
    stateNames = (
        (PowerState.UNKNOWN, 'UNKNOWN'),
        (PowerState.OFF, 'OFF'),
        (PowerState.ON, 'ON'),
        (PowerState.SLEEP, 'SLEEP'),
        (PowerState.UNPLUGGED, 'UNPLUGGED'),
    )
    for state, strStateName in stateNames:
        if powerState == state:
            return strStateName
    assert False

View File

@ -1,249 +1,255 @@
import io
import re
from .JobsState import *
from .QueueMachine import *
from .JobsState import JobsState
from .QueueMachine import QueueMachine, QueueMachineStateFlags
from .Util import *
from .Log import *
from .Job import *
from .Log import logError
from .Job import JobStateFlags, JobId, Job, ParallelEnvironment
import logging
class QstatParser:
def parseJobState( self, strJobStatus ):
jobState = 0
for i in range(0, len(strJobStatus) ):
c = strJobStatus[i]
if c == 'r':
jobState += JobStateFlags.RUNNING
elif c == 'w':
jobState += JobStateFlags.WAITING
elif c == 'q':
jobState += JobStateFlags.QUEUED
elif c == 't':
jobState += JobStateFlags.TRANSFERING
elif c == 'd':
jobState += JobStateFlags.DELETED
elif c == 'h':
jobState += JobStateFlags.HOLD
elif c == 's':
jobState += JobStateFlags.SUSPENDED
elif c == 'E':
jobState += JobStateFlags.ERROR
else:
assert False, 'unhandled job state flag :"' + c + '"'
return jobState
def parseQueueMachineState( self, strQueueMachineStatus ):
queueMachineState = 0
for i in range(0, len(strQueueMachineStatus) ):
c = strQueueMachineStatus[i]
if c == 'd':
queueMachineState += QueueMachineStateFlags.DISABLED
elif c == 'a':
queueMachineState += QueueMachineStateFlags.ALARM
elif c == 'u':
queueMachineState += QueueMachineStateFlags.UNKNOWN
elif c == 'E':
queueMachineState += QueueMachineStateFlags.ERROR
elif c == 'o':
queueMachineState += QueueMachineStateFlags.OBSOLETE
elif c == 's':
queueMachineState += QueueMachineStateFlags.SUSPENDED
else:
assert False, 'unhandled queue machine state flag :"' + c + '"'
return queueMachineState
def parseQstatOutput( self, qstatOutput ):
"""
parses result of command 'qstat -f -u \* -pri'
"""
def parse_pending_tasks(task_ranges_sequence):
"""
parses a job's task ids encoded in the form of a string containing a sequence of ranges
:param str task_ranges_sequence: a job's task ids encoded in the form of a string containing a sequence of non overlapping ranges separated with a comma. Each range is expected to be in the form "<min_index>-<max_index>:<step>"
:return list(int): the list of task ids
for example, this function would return [1, 2, 3, 4, 6, 7, 8] for the input string "1-4:1,6-8:1"
"""
task_ids = []
astrRanges = re.split(',', task_ranges_sequence)
for strRange in astrRanges:
singleIndexMatch = re.match('^(?P<elementIndex>[0-9]+)$', strRange)
if singleIndexMatch:
iElementIndex = int(singleIndexMatch.group('elementIndex'))
task_ids.extend(range(iElementIndex, iElementIndex+1))
else:
# we expect strRange to be of the form "1-4:1", where :
# the 1st number is the min element index (sge imposes it to be greater than 0)
# the 2nd number is the max element index
# the 3rd number is the step between consecutive element indices
rangeMatch = re.match( '^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange)
if rangeMatch == None:
logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line) )
assert(False)
iMinElementIndex=int(rangeMatch.group('minElementIndex'))
iMaxElementIndex=int(rangeMatch.group('maxElementIndex'))
iStepBetweenIndices=int(rangeMatch.group('stepBetweenIndices'))
task_ids.extend(range(iMinElementIndex, iMaxElementIndex+1, iStepBetweenIndices))
return task_ids
# ugly hack to work around the fact that qstat truncates the fqdn of cluster nodes
# graffy@physix-master:~$ qstat -f -u \*
# queuename qtype resv/used/tot. load_avg arch states
# ---------------------------------------------------------------------------------
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
qstatOutput = re.sub('\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput)
jobsState = JobsState()
f = io.StringIO(qstatOutput)
line = f.readline()
currentQueueMachine = None
bInPendingJobsSection = False
# examples of job line :
# 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1
# a typical job line in the pending jobs section looks like this :
# 43645 0.00000 LC_LV_MC aghoufi qw 08/21/2009 08:14:58 1
# a typical running job array line looks like this
# 43619 0.56000 SimpleJobA raffy r 08/20/2009 18:13:03 1 3
# a typical job array line in the pending jobs section looks like this
# 43646 0.00000 SimpleJobA raffy qw 08/21/2009 09:56:40 1 1-4:1
# nurg The job's total urgency value in normalized fashion.
# npprior The job's -p priority in normalized fashion.
# ntckts The job's ticket amount in normalized fashion.
# ppri The job's -p priority as specified by the user.
jobRegularExp = re.compile( '^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$' )
# example of machine line :
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
machineRegularExp = re.compile( '^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)' )
pendingJobsHeaderRegularExp = re.compile( '^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*' )
while( len(line) > 0 ):
# print line
# check if the current line is a line describing a job running on a machine
matchObj = jobRegularExp.match( line )
if matchObj:
# we are dealing with a job line
if not bInPendingJobsSection:
assert( currentQueueMachine )
#log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
iJobId = int(matchObj.group('jobId'))
jobState = self.parseJobState( matchObj.group('jobStatus') )
strJobArrayDetails = matchObj.group('jobArrayDetails')
bIsJobArray = (len(strJobArrayDetails) != 0)
#logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
# each element of a job array is treated as a separate job for the sake of simplicity.
# For these elements, the job id in sge sense is the same, but they are different in this program's sense
task_ids = range(0,1) # just one element, unless it's a job array
if bIsJobArray:
if bInPendingJobsSection:
task_ids = parse_pending_tasks(strJobArrayDetails)
else:
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
iJobArrayElementIndex = int(strJobArrayDetails)
assert(iJobArrayElementIndex != 0) # sge does not allow element indices to be 0
task_ids = range(iJobArrayElementIndex,iJobArrayElementIndex+1)
for task_id in task_ids:
jobId = None
if bIsJobArray:
jobId = JobId(iJobId, task_id)
else:
jobId = JobId(iJobId)
job = jobsState.getJob(jobId)
#logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr()))
if job == None:
# this job hasn't been encountered yet in the output of qstat ...
# we could either be in the pending jobs section or in the running jobs section
job = Job(jobId)
jobsState.addJob( job )
job.setState( jobState )
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime')
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S')
if bInPendingJobsSection:
job.setSubmitTime( jobStartOrSubmitTime )
else:
job.setStartTime( jobStartOrSubmitTime )
job.setOwner( matchObj.group('jobOwner') )
job.setScriptName( matchObj.group('jobScriptName') )
if bInPendingJobsSection:
job.setNumRequiredSlots(int(matchObj.group('numSlots')))
else:
assert( not bInPendingJobsSection ) # if we are in the pending jobs section, the job should be new
if not bInPendingJobsSection:
job.addSlots( currentQueueMachine.getName(), int(matchObj.group('numSlots')) )
else:
# the current line does not describe a job
if not bInPendingJobsSection:
# check if this line describes the status of a machine
matchObj = machineRegularExp.match( line )
if matchObj:
queueName = matchObj.group('queueName')
machineName = matchObj.group('machineName')
queueMachine = QueueMachine( queueName, machineName )
#log(line)
#log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
#log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
queueMachine.setNumSlots( int( matchObj.group('numTotalSlots') ) )
queueMachine.setNumUsedSlots( int( matchObj.group('numUsedSlots') ) )
strCpuLoad = matchObj.group('cpuLoad')
if strCpuLoad != '-NA-':
queueMachine.setCpuLoad( float(strCpuLoad) )
strQueueMachineState = matchObj.group('queueMachineStatus')
queueMachine.setState( self.parseQueueMachineState( strQueueMachineState ) )
#log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
#log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
currentQueueMachine = queueMachine
jobsState.addQueueMachine( queueMachine )
else:
matchObj = pendingJobsHeaderRegularExp.match( line )
if matchObj:
bInPendingJobsSection = True
currentQueueMachine = None
else:
#print line
None
else:
# we are in a pending jobs section
matchObj = re.match('^[#]+$', line)
if not matchObj:
# unexpected line
print('line = "' + line + '"')
assert( False )
None
line = f.readline()
f.close()
return jobsState
def parseJobDetails( self, qstatOutput, job ):
"""
adds to job the details parsed from the output of the "qstat -j <jobid>" command
"""
f = io.StringIO(qstatOutput)
line = f.readline()
fieldRegularExp = re.compile( '^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$' )
while( len(line) > 0 ):
# print line
# check if the current line is a line describing a job running on a machine
matchObj = fieldRegularExp.match( line )
if matchObj:
fieldName = matchObj.group('fieldName')
strFieldValue = matchObj.group('fieldValue')
if fieldName == 'job_number':
assert( job.getId().asStr() == strFieldValue )
elif fieldName == 'hard_queue_list':
allowedQueues = strFieldValue.split(',')
assert(len(allowedQueues) > 0)
job.m_jobRequirements.m_queues = allowedQueues
elif fieldName == 'parallel environment':
# the value could be 'ompi range: 32'
matchObj = re.match('ompi range: (?P<numSlots>[0-9]+)[?]*', strFieldValue)
if matchObj:
job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI
else:
assert( False )
else:
# ignore he other fields
None
line = f.readline()
f.close()
def parseJobState(self, strJobStatus):
    """
    converts the status column of a job line of qstat (eg 'r' or 'qw') into a combination of JobStateFlags

    :param str strJobStatus: the state letters of the job; an empty string gives 0
    :return: the sum of the JobStateFlags associated with each letter
    """
    # NOTE(review): the flags are summed, which presumably relies on each letter appearing at most once -- to be confirmed
    jobState = 0
    for flagLetter in strJobStatus:
        if flagLetter == 'r':
            jobState += JobStateFlags.RUNNING
        elif flagLetter == 'w':
            jobState += JobStateFlags.WAITING
        elif flagLetter == 'q':
            jobState += JobStateFlags.QUEUED
        elif flagLetter == 't':
            jobState += JobStateFlags.TRANSFERING
        elif flagLetter == 'd':
            jobState += JobStateFlags.DELETED
        elif flagLetter == 'h':
            jobState += JobStateFlags.HOLD
        elif flagLetter == 's':
            jobState += JobStateFlags.SUSPENDED
        elif flagLetter == 'E':
            jobState += JobStateFlags.ERROR
        else:
            assert False, 'unhandled job state flag :"' + flagLetter + '"'
    return jobState
def parseQueueMachineState(self, strQueueMachineStatus):
    """
    converts the 'states' column of a queue machine line of qstat (eg 'd' or 'au') into a combination of QueueMachineStateFlags

    :param str strQueueMachineStatus: the state letters of the queue machine; an empty string means that no flag is set
    :return int: the bitwise or of the QueueMachineStateFlags associated with each letter
    """
    queueMachineState = 0
    for c in strQueueMachineStatus:
        # the flags are combined with | instead of + : this way, a letter that would appear twice can't carry into the neighbouring bit
        if c == 'd':
            queueMachineState |= QueueMachineStateFlags.DISABLED
        elif c == 'a':
            queueMachineState |= QueueMachineStateFlags.ALARM
        elif c == 'u':
            queueMachineState |= QueueMachineStateFlags.UNKNOWN
        elif c == 'E':
            queueMachineState |= QueueMachineStateFlags.ERROR
        elif c == 'o':
            queueMachineState |= QueueMachineStateFlags.OBSOLETE
        elif c == 's':
            queueMachineState |= QueueMachineStateFlags.SUSPENDED
        else:
            assert False, 'unhandled queue machine state flag :"' + c + '"'
    return queueMachineState
def parseQstatOutput(self, qstatOutput):
"""
parses result of command 'qstat -f -u \* -pri'
"""
# Overview : qstatOutput is read line by line, and each line is classified with the regular expressions defined below :
# - before the 'PENDING JOBS' header, a line matching machineRegularExp starts the section of a queue machine, and the job lines that follow are the jobs running on this queue machine (currentQueueMachine)
# - after the 'PENDING JOBS' header (bInPendingJobsSection), the job lines describe pending jobs, which are not attached to any queue machine
# The function returns a JobsState that contains the QueueMachine and the Job instances that have been found.
# NOTE(review): jobRegularExp requires the nurg, npprior, ntckts and ppri columns, which don't appear in the example job lines below; presumably these columns are only present when qstat is run with -pri (as in the docstring above) -- to be confirmed against the command line actually used by the caller
def parse_pending_tasks(task_ranges_sequence):
"""
parses a job's task ids encoded in the form of a string containing a sequence of ranges
:param str task_ranges_sequence: a job's task ids encoded in the form of a string containing a sequence of non overlapping ranges separated with a comma. Each range is expected to be in the form "<min_index>-<max_index>:<step>"
:return list(int): the list of task ids
for example, this function would return [1, 2, 3, 4, 6, 7, 8] for the input string "1-4:1,6-8:1"
"""
task_ids = []
astrRanges = re.split(',', task_ranges_sequence)
for strRange in astrRanges:
# a range can also be reduced to a single task id (eg "3")
singleIndexMatch = re.match('^(?P<elementIndex>[0-9]+)$', strRange)
if singleIndexMatch:
iElementIndex = int(singleIndexMatch.group('elementIndex'))
task_ids.extend(range(iElementIndex, iElementIndex + 1))
else:
# we expect strRange to be of the form "1-4:1", where :
# the 1st number is the min element index (sge imposes it to be greater than 0)
# the 2nd number is the max element index
# the 3rd number is the step between consecutive element indices
rangeMatch = re.match('^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange)
if rangeMatch is None:
# line is the variable of the enclosing function : it's the line of qstat output that is being parsed
logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line))
assert False
iMinElementIndex = int(rangeMatch.group('minElementIndex'))
iMaxElementIndex = int(rangeMatch.group('maxElementIndex'))
iStepBetweenIndices = int(rangeMatch.group('stepBetweenIndices'))
task_ids.extend(range(iMinElementIndex, iMaxElementIndex + 1, iStepBetweenIndices))
return task_ids
# ugly hack to work around the fact that qstat truncates the fqdn of cluster nodes
# graffy@physix-master:~$ qstat -f -u \*
# queuename qtype resv/used/tot. load_avg arch states
# ---------------------------------------------------------------------------------
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
qstatOutput = re.sub(r'\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput)
jobsState = JobsState()
f = io.StringIO(qstatOutput)
line = f.readline()
currentQueueMachine = None
bInPendingJobsSection = False
# examples of job line :
# 43521 0.55108 Confidiso3 aghoufi r 08/19/2009 18:40:09 1
# a typical job line in the pending jobs section looks like this :
# 43645 0.00000 LC_LV_MC aghoufi qw 08/21/2009 08:14:58 1
# a typical running job array line looks like this
# 43619 0.56000 SimpleJobA raffy r 08/20/2009 18:13:03 1 3
# a typical job array line in the pending jobs section looks like this
# 43646 0.00000 SimpleJobA raffy qw 08/21/2009 09:56:40 1 1-4:1
# nurg The job's total urgency value in normalized fashion.
# npprior The job's -p priority in normalized fashion.
# ntckts The job's ticket amount in normalized fashion.
# ppri The job's -p priority as specified by the user.
jobRegularExp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$')
# example of machine line :
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
machineRegularExp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)')
pendingJobsHeaderRegularExp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*')
# readline returns an empty string once the end of the output has been reached
while len(line) > 0:
# print line
# check if the current line is a line describing a job running on a machine
matchObj = jobRegularExp.match(line)
if matchObj:
# we are dealing with a job line
if not bInPendingJobsSection:
# a running job is expected to come after the line of the queue machine it runs on
assert currentQueueMachine
# log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
iJobId = int(matchObj.group('jobId'))
logging.debug('iJobId = %d' % iJobId)
jobState = self.parseJobState(matchObj.group('jobStatus'))
strJobArrayDetails = matchObj.group('jobArrayDetails')
bIsJobArray = (len(strJobArrayDetails) != 0)
# logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
# each element of a job array is treated as a separate job for the sake of simplicity.
# For these elements, the job id in sge sense is the same, but they are different in this program's sense
# (the 0 below is just a placeholder : for a job that is not a job array, the JobId is built without any task id)
task_ids = range(0, 1) # just one element, unless it's a job array
if bIsJobArray:
if bInPendingJobsSection:
task_ids = parse_pending_tasks(strJobArrayDetails)
else:
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
iJobArrayElementIndex = int(strJobArrayDetails)
assert iJobArrayElementIndex != 0 # sge does not allow element indices to be 0
task_ids = range(iJobArrayElementIndex, iJobArrayElementIndex + 1)
logging.debug('task_ids = %s' % task_ids)
for task_id in task_ids:
logging.debug('task_id = %s' % task_id)
jobId = None
if bIsJobArray:
jobId = JobId(iJobId, task_id)
else:
jobId = JobId(iJobId)
job = jobsState.getJob(jobId)
# logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr()))
if job is None:
# this job hasn't been encountered yet in the output of qstat ...
# we could either be in the pending jobs section or in the running jobs section
job = Job(jobId)
jobsState.addJob(job)
job.setState(jobState)
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime')
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S')
# the same column holds the submit time of a pending job and the start time of a running job
if bInPendingJobsSection:
job.setSubmitTime(jobStartOrSubmitTime)
else:
job.setStartTime(jobStartOrSubmitTime)
job.setOwner(matchObj.group('jobOwner'))
job.setScriptName(matchObj.group('jobScriptName'))
if bInPendingJobsSection:
job.setNumRequiredSlots(int(matchObj.group('numSlots')))
else:
assert not bInPendingJobsSection # if we are in the pending jobs section, the job should be new
# a running job uses slots of the queue machine whose section is being parsed
if not bInPendingJobsSection:
job.addSlots(currentQueueMachine.getName(), int(matchObj.group('numSlots')))
else:
# the current line does not describe a job
if not bInPendingJobsSection:
# check if this line describes the status of a machine
matchObj = machineRegularExp.match(line)
if matchObj:
queueName = matchObj.group('queueName')
machineName = matchObj.group('machineName')
queueMachine = QueueMachine(queueName, machineName)
# log(line)
# log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
# log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
queueMachine.setNumSlots(int(matchObj.group('numTotalSlots')))
queueMachine.setNumUsedSlots(int(matchObj.group('numUsedSlots')))
strCpuLoad = matchObj.group('cpuLoad')
# the cpu load is left unset when qstat reports it as '-NA-'
if strCpuLoad != '-NA-':
queueMachine.setCpuLoad(float(strCpuLoad))
strQueueMachineState = matchObj.group('queueMachineStatus')
queueMachine.setState(self.parseQueueMachineState(strQueueMachineState))
# log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
# log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
# the job lines that follow belong to this queue machine
currentQueueMachine = queueMachine
jobsState.addQueueMachine(queueMachine)
else:
matchObj = pendingJobsHeaderRegularExp.match(line)
if matchObj:
bInPendingJobsSection = True
currentQueueMachine = None
else:
# print line
# nothing to do : the lines that are neither job lines, machine lines nor the pending jobs header are ignored
None
else:
# we are in a pending jobs section
# besides job lines, the only lines accepted in this section are the ones exclusively made of '#' characters
matchObj = re.match('^[#]+$', line)
if not matchObj:
# unexpected line
print('line = "' + line + '"')
assert False
None
line = f.readline()
f.close()
return jobsState
def parseJobDetails(self, qstatOutput, job):
    """
    adds to job the details parsed from the output of the "qstat -j <jobid>" command

    :param str qstatOutput: the text printed by "qstat -j <jobid>", made of "<field name>: <field value>" lines
    :param job: the job to complete. Its id is checked against the 'job_number' field, and its m_jobRequirements
        receive the allowed queues ('hard_queue_list' field) and the parallel environment ('parallel environment' field)
    """
    # the value of a field is everything that follows the first colon and its padding spaces.
    # Note : the value used to be matched with '[?]*', which only accepts literal question marks, so that no real field was ever recognized
    fieldRegularExp = re.compile(r'^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>.*)$')
    for line in io.StringIO(qstatOutput):
        matchObj = fieldRegularExp.match(line)
        if not matchObj:
            # this line doesn't describe a field
            continue
        fieldName = matchObj.group('fieldName')
        strFieldValue = matchObj.group('fieldValue').strip()
        if fieldName == 'job_number':
            assert job.getId().asStr() == strFieldValue
        elif fieldName == 'hard_queue_list':
            allowedQueues = strFieldValue.split(',')
            assert len(allowedQueues) > 0
            job.m_jobRequirements.m_queues = allowedQueues
        elif fieldName == 'parallel environment':
            # the value could be 'ompi range: 32'
            if re.match('ompi range: (?P<numSlots>[0-9]+)', strFieldValue):
                job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI
            else:
                assert False, 'unhandled parallel environment : "%s"' % strFieldValue
        # the other fields are ignored

View File

@ -1,65 +1,81 @@
class QueueMachineStateFlags:
    """
    the flags that describe the state of a queue machine. Each flag uses its own bit, so that the state of a
    queue machine can be stored as a combination of these flags in a single integer
    """
    DISABLED = 1  # the queue machine is disabled
    ALARM = 2  # the queue machine is in alarm state (see man qstat)
    UNKNOWN = 4  # the queue machine is in unknown state because sge_execd cannot be contacted (see man qstat)
    ERROR = 8  # the queue is in error state
    OBSOLETE = 16  # the queue no longer exists but it is still visible because it still contains running jobs
    SUSPENDED = 32  # the queue machine is suspended
class QueueMachine:
    """
    a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
    """

    def __init__(self, queueName, machineName):
        """
        :param str queueName: the name of the queue (eg allintel.q)
        :param str machineName: the name of the machine (eg simpatix10)
        """
        self.m_queueName = queueName
        self.m_machineName = machineName
        self.m_numSlots = None  # unknown until setNumSlots is called
        self.m_numUsedSlots = None  # unknown until setNumUsedSlots is called
        self.m_fCpuLoad = None  # unknown until setCpuLoad is called
        self.m_stateFlags = 0  # a combination of QueueMachineStateFlags
        self.m_strDisableMessage = ''
        # note : a getStateAsString accessor used to be kept here in a disabled form; it read m_strState, an attribute that this class never sets

    def getName(self):
        """
        returns the name of the machine queue (such as allintel.q@simpatix10)
        """
        return self.m_queueName + '@' + self.m_machineName

    def getQueueName(self):
        return self.m_queueName

    def getMachineName(self):
        return self.m_machineName

    def setNumSlots(self, numSlots):
        self.m_numSlots = numSlots

    def setNumUsedSlots(self, numSlots):
        self.m_numUsedSlots = numSlots

    def getNumSlots(self):
        assert self.m_numSlots is not None
        return self.m_numSlots

    def getNumUsedSlots(self):
        assert self.m_numUsedSlots is not None
        return self.m_numUsedSlots

    def setCpuLoad(self, fCpuLoad):
        self.m_fCpuLoad = fCpuLoad

    def cpuLoadIsAvailable(self):
        """
        returns True if the cpu load has been set
        """
        return self.m_fCpuLoad is not None

    def getCpuLoad(self):
        assert self.m_fCpuLoad is not None
        return self.m_fCpuLoad

    def setState(self, state):
        """
        :param int state: a combination of QueueMachineStateFlags
        """
        self.m_stateFlags = state

    # the predicates below all return a real boolean, like isResponding has always done (they used to return the masked integer)

    def isDisabled(self):
        return (self.m_stateFlags & QueueMachineStateFlags.DISABLED) != 0

    def isInErrorState(self):
        return (self.m_stateFlags & QueueMachineStateFlags.ERROR) != 0

    def isResponding(self):
        return not (self.m_stateFlags & QueueMachineStateFlags.UNKNOWN)

    def isInAlarmState(self):
        return (self.m_stateFlags & QueueMachineStateFlags.ALARM) != 0

    def isSuspended(self):
        return (self.m_stateFlags & QueueMachineStateFlags.SUSPENDED) != 0

View File

@ -1,141 +1,147 @@
from PowerState import *
from Log import *
from PowerState import PowerState
from Log import logInfo
import time
import copy
class Slot:
    """
    a group of slots of a queue machine, with the job they are allocated to
    """

    def __init__(self):
        self.m_queueMachine = None
        self.m_numSlots = None
        self.m_job = None  # job for which this slot is allocated
class SlotAllocator:
    """
    a class that defines a strategy for allocating free slots for the given pending jobs
    """

    def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
        """
        returns the machines that need to wake up to make pending jobs running

        :param dict pendingJobs: the jobs that are waiting for slots
        :param clusterState: the current state of the cluster
        :return dict: the machines to wake up, indexed by their name
        """
        # this method is abstract. The error is raised explicitly because an 'assert False' would be removed when python is run with -O
        raise AssertionError('SlotAllocator.getMachinesThatNeedWakeUp is abstract')
class SimpleSlotAllocator(SlotAllocator):
    """
    a slot allocator that looks for enough slots for the first of the pending jobs : the free slots of the
    machines that are on are used first, then the free slots of the machines that are asleep
    """

    def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
        """
        returns the sleeping machines that need to wake up so that the first pending job gets the slots it requires

        :param dict pendingJobs: the jobs that are waiting for slots; only the first one is considered
        :param clusterState: the current state of the cluster
        :return dict: the machines to wake up, indexed by their name. It's empty if there's no pending job, if the
            machines that are on have enough free slots, or if there are not enough free slots at all
        """
        if not pendingJobs:
            return {}  # no job is waiting for slots
        # the values of a dictionary can't be indexed in python 3, hence next(iter(...)) instead of values()[0]
        highestPriorityPendingJob = next(iter(pendingJobs.values()))
        logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())
        jobsState = clusterState.getJobsState()
        queueMachines = list(jobsState.getQueueMachines().values())
        numFreeSlots = {}  # contains the number of free slots for each queueMachine
        for queueMachine in queueMachines:
            numFreeSlots[queueMachine] = jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
            logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[ %s ] with %d ' % (queueMachine.getName(), numFreeSlots[queueMachine]))
        jobRequirements = highestPriorityPendingJob.m_jobRequirements
        remainingNumSlotsToAllocate = jobRequirements.m_numSlots
        logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
        machines = clusterState.getMachines()
        machinesThatNeedWakeUp = {}
        # first look in running machines if there are available slots, and only then look into machines that are asleep
        for powerState, strPowerState in ((PowerState.ON, 'already running'), (PowerState.SLEEP, 'sleeping')):
            for queueMachine in queueMachines:
                if remainingNumSlotsToAllocate <= 0:
                    break
                logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
                machineName = queueMachine.getMachineName()
                if machineName not in machines:
                    continue  # this machine is not under the cluster controller's control
                machine = machines[machineName]
                if machine.getPowerState() != powerState:
                    continue
                if not clusterState.queueMachineFitsJobRequirements(queueMachine, jobRequirements):
                    continue
                numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
                if numSlotsAllocatedOnThisMachine <= 0:
                    continue  # no free slot here : waking up this machine wouldn't help
                logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on %s %s ' % (numSlotsAllocatedOnThisMachine, strPowerState, machineName))
                remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
                numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
                if powerState == PowerState.SLEEP:
                    machinesThatNeedWakeUp[machine.getName()] = machine
                logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
        if remainingNumSlotsToAllocate > 0:
            return {}  # not enough slots available
        return machinesThatNeedWakeUp
class DecoupledSlotAllocator(SlotAllocator):
    """
    a slot allocator that doesn't know much about sge, and does not attempt to guess what sge's scheduler would do
    Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
    """

    def __init__(self):
        self.m_delayBetweenPeriodicChecks = -1  # in seconds. Disable periodic checks by setting this to -1
        self.m_lastCheckTime = time.time()
        self.m_lastClusterState = None  # the cluster state received by the previous call to getMachinesThatNeedWakeUp

    def jobsStateHasChanged(self, newClusterState):
        """
        returns true if there is a change in the cluster state that can cause a pending job
        to start (provided all machines are enabled)

        The jobs of newClusterState are compared with the jobs of the last cluster state that has been memorized :
        the state has changed if a job has appeared or disappeared.
        """
        oldJobs = {}
        if self.m_lastClusterState:
            oldJobs = self.m_lastClusterState.m_jobsState.m_jobs
        newJobs = newClusterState.m_jobsState.m_jobs
        bJobsHaveChanged = False
        oldJobsOnly = oldJobs.copy()  # shallow copy, from which the jobs that still exist are removed
        for newJob in newJobs.values():
            newJobId = newJob.getId()
            if newJobId in oldJobs:
                del oldJobsOnly[newJobId]
            else:
                # ah ... a new job has arrived
                logInfo('A new job (jobId =%s) has been detected ' % newJobId.asStr())
                bJobsHaveChanged = True
        for oldJob in oldJobsOnly.values():
            # at least one old job has finished, freeing some slots
            logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr())
            bJobsHaveChanged = True
        return bJobsHaveChanged

    def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
        """
        returns all the sleeping machines when the jobs have changed since the previous call, or when it's time for
        a periodic check; otherwise no machine is returned

        :param dict pendingJobs: not used by this allocator
        :param clusterState: the current state of the cluster. A copy of it is memorized for the next call
        :return dict: the machines to wake up, indexed by their name
        """
        machinesThatNeedWakeUp = {}
        bJobsStateHasChanged = self.jobsStateHasChanged(clusterState)
        currentTime = time.time()
        # we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
        # for example changes in the requirements, in the allocation policy, etc...
        bItsTimeForPeriodicCheck = False
        if self.m_delayBetweenPeriodicChecks > 0:
            bItsTimeForPeriodicCheck = (currentTime - self.m_lastCheckTime) > self.m_delayBetweenPeriodicChecks
        if bJobsStateHasChanged or bItsTimeForPeriodicCheck:
            if bJobsStateHasChanged:
                logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed')
            else:
                logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)')
            machines = clusterState.getMachines()
            for queueMachine in clusterState.getJobsState().getQueueMachines().values():
                machineName = queueMachine.getMachineName()
                if machineName not in machines:
                    continue  # this machine is not under the cluster controller's control
                machine = machines[machineName]
                if machine.getPowerState() == PowerState.SLEEP:
                    machinesThatNeedWakeUp[machine.getName()] = machine
            self.m_lastCheckTime = currentTime
        # NOTE(review): copy.copy is a shallow copy, so the memorized state shares its m_jobsState with clusterState; this presumably relies on the caller replacing m_jobsState instead of modifying it in place -- to be confirmed
        self.m_lastClusterState = copy.copy(clusterState)
        return machinesThatNeedWakeUp
"""
a class that defines a strategy for allocating free slots for the given pending jobs
"""
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
    """
    returns the machines that need to wake up to make pending jobs running

    :param dict pendingJobs: the jobs that are waiting for slots
    :param clusterState: the current state of the cluster
    :return dict: the machines to wake up, indexed by their name
    """
    # this method is abstract. The error is raised explicitly because an 'assert False' would be removed when python is run with -O
    raise AssertionError('SlotAllocator.getMachinesThatNeedWakeUp is abstract')
class SimpleSlotAllocator(SlotAllocator):
    """
    a slot allocator that looks for enough slots for the first of the pending jobs : the free slots of the
    machines that are on are used first, then the free slots of the machines that are asleep
    """

    def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
        """
        returns the sleeping machines that need to wake up so that the first pending job gets the slots it requires

        :param dict pendingJobs: the jobs that are waiting for slots; only the first one is considered
        :param clusterState: the current state of the cluster
        :return dict: the machines to wake up, indexed by their name. It's empty if there's no pending job, if the
            machines that are on have enough free slots, or if there are not enough free slots at all
        """
        if not pendingJobs:
            return {}  # no job is waiting for slots
        # the values of a dictionary can't be indexed in python 3, hence next(iter(...)) instead of values()[0]
        highestPriorityPendingJob = next(iter(pendingJobs.values()))
        logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())
        jobsState = clusterState.getJobsState()
        queueMachines = list(jobsState.getQueueMachines().values())
        numFreeSlots = {}  # contains the number of free slots for each queueMachine
        for queueMachine in queueMachines:
            numFreeSlots[queueMachine] = jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
            logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[%s] with %d ' % (queueMachine.getName(), numFreeSlots[queueMachine]))
        jobRequirements = highestPriorityPendingJob.m_jobRequirements
        remainingNumSlotsToAllocate = jobRequirements.m_numSlots
        logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
        machines = clusterState.getMachines()
        machinesThatNeedWakeUp = {}
        # first look in running machines if there are available slots, and only then look into machines that are asleep
        for powerState, strPowerState in ((PowerState.ON, 'already running'), (PowerState.SLEEP, 'sleeping')):
            for queueMachine in queueMachines:
                if remainingNumSlotsToAllocate <= 0:
                    break
                logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
                machineName = queueMachine.getMachineName()
                if machineName not in machines:
                    continue  # this machine is not under the cluster controller's control
                machine = machines[machineName]
                if machine.getPowerState() != powerState:
                    continue
                if not clusterState.queueMachineFitsJobRequirements(queueMachine, jobRequirements):
                    continue
                numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
                if numSlotsAllocatedOnThisMachine <= 0:
                    continue  # no free slot here : waking up this machine wouldn't help
                logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on %s %s ' % (numSlotsAllocatedOnThisMachine, strPowerState, machineName))
                remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
                numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
                if powerState == PowerState.SLEEP:
                    machinesThatNeedWakeUp[machine.getName()] = machine
                logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
        if remainingNumSlotsToAllocate > 0:
            return {}  # not enough slots available
        return machinesThatNeedWakeUp
class DecoupledSlotAllocator(SlotAllocator):
    """
    a slot allocator that doesn't know much about sge, and does not attempt to guess what sge's scheduler would do
    Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
    """

    def __init__(self):
        self.m_delayBetweenPeriodicChecks = -1  # in seconds. Disable periodic checks by setting this to -1
        self.m_lastCheckTime = time.time()
        self.m_lastClusterState = None  # the cluster state received by the previous call to getMachinesThatNeedWakeUp

    def jobsStateHasChanged(self, newClusterState):
        """
        returns true if there is a change in the cluster state that can cause a pending job
        to start (provided all machines are enabled)

        The jobs of newClusterState are compared with the jobs of the last cluster state that has been memorized :
        the state has changed if a job has appeared or disappeared.
        """
        oldJobs = {}
        if self.m_lastClusterState:
            oldJobs = self.m_lastClusterState.m_jobsState.m_jobs
        newJobs = newClusterState.m_jobsState.m_jobs
        bJobsHaveChanged = False
        oldJobsOnly = oldJobs.copy()  # shallow copy, from which the jobs that still exist are removed
        for newJob in newJobs.values():
            newJobId = newJob.getId()
            if newJobId in oldJobs:
                del oldJobsOnly[newJobId]
            else:
                # ah ... a new job has arrived
                logInfo('A new job (jobId =%s) has been detected ' % newJobId.asStr())
                bJobsHaveChanged = True
        for oldJob in oldJobsOnly.values():
            # at least one old job has finished, freeing some slots
            logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr())
            bJobsHaveChanged = True
        return bJobsHaveChanged

    def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
        """
        returns all the sleeping machines when the jobs have changed since the previous call, or when it's time for
        a periodic check; otherwise no machine is returned

        :param dict pendingJobs: not used by this allocator
        :param clusterState: the current state of the cluster. A copy of it is memorized for the next call
        :return dict: the machines to wake up, indexed by their name
        """
        machinesThatNeedWakeUp = {}
        bJobsStateHasChanged = self.jobsStateHasChanged(clusterState)
        currentTime = time.time()
        # we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
        # for example changes in the requirements, in the allocation policy, etc...
        bItsTimeForPeriodicCheck = False
        if self.m_delayBetweenPeriodicChecks > 0:
            bItsTimeForPeriodicCheck = (currentTime - self.m_lastCheckTime) > self.m_delayBetweenPeriodicChecks
        if bJobsStateHasChanged or bItsTimeForPeriodicCheck:
            if bJobsStateHasChanged:
                logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep because jobs state has changed')
            else:
                logInfo('DecoupledSlotAllocator::getMachinesThatNeedWakeUp : waking up machines that are asleep for periodic check (to be sure pending jobs get a chance to start)')
            machines = clusterState.getMachines()
            for queueMachine in clusterState.getJobsState().getQueueMachines().values():
                machineName = queueMachine.getMachineName()
                if machineName not in machines:
                    continue  # this machine is not under the cluster controller's control
                machine = machines[machineName]
                if machine.getPowerState() == PowerState.SLEEP:
                    machinesThatNeedWakeUp[machine.getName()] = machine
            self.m_lastCheckTime = currentTime
        # NOTE(review): copy.copy is a shallow copy, so the memorized state shares its m_jobsState with clusterState; this presumably relies on the caller replacing m_jobsState instead of modifying it in place -- to be confirmed
        self.m_lastClusterState = copy.copy(clusterState)
        return machinesThatNeedWakeUp

View File

@ -1,58 +1,58 @@
import Util
from QstatParser import *
import time
from Util import executeProgram
from QstatParser import QstatParser
from Log import logDebug, logWarning
class SunGridEngine:
def getCurrentJobsState( self ):
bBUG_00000009_IS_STILL_ALIVE = True
if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Querying the current state of jobs')
returnCode = -1
delayBetweenAttemps = 5 # in seconds
while returnCode != 0:
command = ['qstat', '-f', '-u', '*']
(returnCode, qstatOutput, stderr) = executeProgram( command )
if returnCode != 0:
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps))
time.sleep(delayBetweenAttemps)
if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Just got current state of jobs')
def getCurrentJobsState(self):
bBUG_00000009_IS_STILL_ALIVE = True
if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Querying the current state of jobs')
returnCode = -1
delayBetweenAttemps = 5 # in seconds
while returnCode != 0:
command = ['qstat', '-f', '-u', '*']
(returnCode, qstatOutput, stderr) = executeProgram(command)
if returnCode != 0:
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps))
time.sleep(delayBetweenAttemps)
if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Just got current state of jobs')
jobsState = QstatParser().parseQstatOutput( qstatOutput )
jobsState.setTime( time.time() )
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
if False: # no need for job details at the moment and since it's very slow, it's been disabled
for unused_jobId, job in jobsState.getPendingJobs().items():
(returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] )
assert returnCode != 0, 'prout'
QstatParser().parseJobDetails( stdout, job )
return jobsState
jobsState = QstatParser().parseQstatOutput(qstatOutput)
jobsState.setTime(time.time())
def setQueueInstanceActivation( self, strQueueInstanceName, bEnable ):
argument = 'd'
if bEnable:
argument = 'e'
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
delayBetweenAttemps = 5 # in seconds
while True:
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-'+argument, strQueueInstanceName])
if bBUG_00000269_IS_STILL_ALIVE:
# if the command failed, try again
if errorCode == 0:
break
time.sleep(delayBetweenAttemps)
else:
break
return (errorCode == 0)
def queueIsEmpty(self, strMachineName):
    """
    tells whether qstat reports no job on the given machine

    @param strMachineName the name of the machine to check
    @return True if no job is found on strMachineName in the output of qstat
    """
    qstatCommand = ['qstat', '-f', '-u', '*']
    (returnCode, qstatOutput, unused_stderr) = executeProgram(qstatCommand)
    assert returnCode == 0
    jobsOnMachine = QstatParser().parseQstatOutput(qstatOutput).getJobsOnMachine(strMachineName)
    return len(jobsOnMachine) == 0
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
if False: # no need for job details at the moment and since it's very slow, it's been disabled
for unused_jobId, job in jobsState.getPendingJobs().items():
(returnCode, stdout, stderr) = executeProgram(['qstat', '-j', job.getId().asStr()])
assert returnCode != 0, 'prout'
QstatParser().parseJobDetails(stdout, job)
return jobsState
def setQueueInstanceActivation(self, strQueueInstanceName, bEnable):
    """
    enables or disables the given sge queue instance using qmod

    @param strQueueInstanceName the name of the queue instance, passed as is to qmod
    @param bEnable if True, the queue instance is enabled (qmod -e); otherwise it is disabled (qmod -d)
    @return True if the qmod command succeeded
    """
    # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
    bBUG_00000269_IS_STILL_ALIVE = True
    delayBetweenAttemps = 5  # in seconds
    qmodCommand = ['qmod', '-' + ('e' if bEnable else 'd'), strQueueInstanceName]
    errorCode, unused_stdout, unused_stderr = executeProgram(qmodCommand)
    # as long as the workaround is active, a failed command is retried until it succeeds
    while bBUG_00000269_IS_STILL_ALIVE and errorCode != 0:
        time.sleep(delayBetweenAttemps)
        errorCode, unused_stdout, unused_stderr = executeProgram(qmodCommand)
    return errorCode == 0
def queueIsEmpty(self, strMachineName):
    """
    tells whether qstat reports no job on the given machine

    @param strMachineName the name of the machine to check
    @return True if no job is found on strMachineName in the output of qstat
    """
    qstatCommand = ['qstat', '-f', '-u', '*']
    (returnCode, qstatOutput, unused_stderr) = executeProgram(qstatCommand)
    assert returnCode == 0
    jobsOnMachine = QstatParser().parseQstatOutput(qstatOutput).getJobsOnMachine(strMachineName)
    return len(jobsOnMachine) == 0

View File

@ -1,53 +1,56 @@
#!/usr/bin/env python
import sys
sys.path.insert(0, '..')
from Log import *
from Log import logInfo
import Util
from PowerState import *
from PowerState import PowerState
from HTMLParser import HTMLParser
def Test0000():
    """
    endlessly alternates the target machine between sleep and wake up

    the loop only stops when one of the assertions fails (a transition failed, or the
    machine is in a power state other than ON or SLEEP).
    """
    logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
    strTargetMachineName = 'simpatix12'
    eCurrentState = Util.getPowerState(strTargetMachineName)
    while True:
        if eCurrentState == PowerState.ON:
            bSleepOk = Util.blockingPutMachineToSleep(strTargetMachineName)
            assert bSleepOk
            # the sleep order is issued a second time; its result is not checked
            bSleepOk = Util.blockingPutMachineToSleep(strTargetMachineName)
            eCurrentState = PowerState.SLEEP
        elif eCurrentState == PowerState.SLEEP:
            bWakeUpOk = Util.blockingWakeUpMachine(strTargetMachineName)
            assert bWakeUpOk
            eCurrentState = PowerState.ON
        else:
            assert False
def Test0001():
    """
    sends a sleep order to a powered-on machine and immediately requests a wake up

    the machine is first woken up if it is found sleeping; the test asserts that the
    final blocking wake up succeeds.
    """
    logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same tim ?')
    strTargetMachineName = 'simpatix12'
    eCurrentState = Util.getPowerState(strTargetMachineName)
    if eCurrentState == PowerState.SLEEP:
        bWakeUpOk = Util.blockingWakeUpMachine(strTargetMachineName)
        assert bWakeUpOk
        eCurrentState = PowerState.ON
    assert eCurrentState == PowerState.ON
    strSleepCommand = "ssh %s 'pmset sleepnow'" % strTargetMachineName
    Util.executeCommand(strSleepCommand)
    bWakeUpOk = Util.blockingWakeUpMachine(strTargetMachineName)
    assert bWakeUpOk
def Test0002():
    """
    sends a power on order to a sleeping machine, quickly followed by a sleep order

    the machine is first put to sleep if it is found powered on, so that the test always
    starts from the SLEEP state.
    """
    logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
    strTargetMachineName = 'simpatix12'
    ePowerState = Util.getPowerState(strTargetMachineName)
    if ePowerState == PowerState.ON:
        # the machine has to be asleep for this test : put it to sleep (the original code woke it up instead)
        bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
        assert bSuccess
        ePowerState = PowerState.SLEEP
    assert ePowerState == PowerState.SLEEP
    # executeIpmiCommand extends its argument list with ipmiCommandArgs, so the ipmi arguments
    # have to be passed as a list (a string would be split into single characters)
    Util.executeIpmiCommand(strTargetMachineName, ['chassis', 'power', 'on'])
    Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
# script entry point : runs Test0000, which keeps cycling the target machine between sleep and wake up until one of its assertions fails
if __name__ == '__main__':
Test0000()
Test0000()

View File

@ -1,228 +1,234 @@
#import .Util
#import ..SimpaDbUtil
from .Log import *
from .PowerState import *
# import .Util
# import ..SimpaDbUtil
from .Log import logDebug, logInfo, logWarning, logError
from .PowerState import PowerState, PowerStateToStr
import io
import os
import re
import sys
import time
import traceback
def executeProgram(astrArguments):
    """
    runs a program through Lib.Util.executeProgram and logs its outcome

    @param astrArguments the program and its arguments, as a list of strings
    @return the tuple (returnCode, stdout, stderr)
    """
    bBUG_00000008_IS_STILL_ACTIVE = True
    strProgram = ','.join(astrArguments)
    if bBUG_00000008_IS_STILL_ACTIVE:
        logDebug('executeProgram : program = [%s]' % strProgram)
    (returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments)
    if bBUG_00000008_IS_STILL_ACTIVE:
        logDebug('executeCommand : return code of [%s] = %d' % (strProgram, returnCode))
    if returnCode != 0:
        # for debugging purpose, log everything we know about the failed command
        logDebug('executeCommand : return code of [%s] = %d' % (strProgram, returnCode))
        logDebug('executeCommand : stdout of [%s] = %s' % (strProgram, stdout))
        logDebug('executeCommand : stderr of [%s] = %s' % (strProgram, stderr))
    return (returnCode, stdout, stderr)
def executeCommand(command):
    """
    executes the given command by delegating to Lib.Util.executeCommand

    @param command the command to execute
    @return the tuple (returnCode, stdout, stderr) obtained from Lib.Util.executeCommand
    """
    commandResult = Lib.Util.executeCommand(command)
    (returnCode, stdout, stderr) = commandResult
    return (returnCode, stdout, stderr)
def executeProgram(astrArguments):
    """
    runs a program through Lib.Util.executeProgram and logs its outcome

    @param astrArguments the program and its arguments, as a list of strings
    @return the tuple (returnCode, stdout, stderr)
    """
    bBUG_00000008_IS_STILL_ACTIVE = True
    strProgram = ','.join(astrArguments)
    if bBUG_00000008_IS_STILL_ACTIVE:
        logDebug('executeProgram : program = [%s]' % strProgram)
    (returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments)
    if bBUG_00000008_IS_STILL_ACTIVE:
        logDebug('executeCommand : return code of [%s] = %d' % (strProgram, returnCode))
    if returnCode != 0:
        # for debugging purpose, log everything we know about the failed command
        logDebug('executeCommand : return code of [%s] = %d' % (strProgram, returnCode))
        logDebug('executeCommand : stdout of [%s] = %s' % (strProgram, stdout))
        logDebug('executeCommand : stderr of [%s] = %s' % (strProgram, stderr))
    return (returnCode, stdout, stderr)
def executeIpmiCommand(machineName, ipmiCommandArgs):
    """
    runs ipmitool against the light out management interface of the given machine

    @param machineName the machine whose light out management ip address is looked up
    @param ipmiCommandArgs the ipmitool arguments, as a list of strings (eg ['chassis', 'power', 'on'])
    @return the tuple (returnCode, stdout, stderr) of the ipmitool command
    """
    # bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give
    # different errors, these errors not always being timeouts). Therefore we try and try again, until the
    # command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail.
    # The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this
    # hack, cluster controller is expecting all machines to be plugged.
    #
    # examples of failures that have been observed :
    #   Unabled to establish a session with the BMC.
    #   Command failed due to insufficient resources for session (0xFFFEF901)
    #     -> this error means that the number of active conections to the BMC has reached the maximum (usually 5).
    #   Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
    #   Command failed due to Timeout (0xFFFEF9C3)
    bBUG_00000005_IS_STILL_ACTIVE = True
    lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
    lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName)
    astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath] + list(ipmiCommandArgs)
    if not bBUG_00000005_IS_STILL_ACTIVE:
        (returnCode, stdout, stderr) = executeProgram(astrProgram)
        return (returnCode, stdout, stderr)
    while True:
        (returnCode, stdout, stderr) = executeProgram(astrProgram)
        if returnCode == 0:
            return (returnCode, stdout, stderr)
        logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
        time.sleep(5)  # wait for 5 seconds before the next attempt, in order not to saturate activity
def executeCommand(command):
    """
    executes the given command by delegating to Lib.Util.executeCommand

    @param command the command to execute
    @return the tuple (returnCode, stdout, stderr) obtained from Lib.Util.executeCommand
    """
    commandResult = Lib.Util.executeCommand(command)
    (returnCode, stdout, stderr) = commandResult
    return (returnCode, stdout, stderr)
def executeIpmiCommand(machineName, ipmiCommandArgs):
    """
    runs ipmitool against the light out management interface of the given machine

    @param machineName the machine whose light out management ip address is looked up
    @param ipmiCommandArgs the ipmitool arguments, as a list of strings (eg ['chassis', 'power', 'on'])
    @return the tuple (returnCode, stdout, stderr) of the ipmitool command
    """
    # bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give
    # different errors, these errors not always being timeouts). Therefore we try and try again, until the
    # command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail.
    # The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this
    # hack, cluster controller is expecting all machines to be plugged.
    #
    # examples of failures that have been observed :
    #   Unabled to establish a session with the BMC.
    #   Command failed due to insufficient resources for session (0xFFFEF901)
    #     -> this error means that the number of active conections to the BMC has reached the maximum (usually 5).
    #   Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
    #   Command failed due to Timeout (0xFFFEF9C3)
    bBUG_00000005_IS_STILL_ACTIVE = True
    lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
    lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName)
    astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath] + list(ipmiCommandArgs)
    if not bBUG_00000005_IS_STILL_ACTIVE:
        (returnCode, stdout, stderr) = executeProgram(astrProgram)
        return (returnCode, stdout, stderr)
    while True:
        (returnCode, stdout, stderr) = executeProgram(astrProgram)
        if returnCode == 0:
            return (returnCode, stdout, stderr)
        logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
        time.sleep(5)  # wait for 5 seconds before the next attempt, in order not to saturate activity
def getPowerState(machineName):
    """
    reads the power state of the given machine from its 'ACPI State' ipmi sensor

    @param machineName the name of the machine to query
    @return a PowerState value : ON, SLEEP or OFF depending on the reported acpi state, ON when the ipmitool
        output contains no acpi state (see bug 00000002), UNPLUGGED when the sensor could not be read 5 times
    """
    bBUG_00000002_IS_STILL_ACTIVE = True
    iMAX_NUM_ATTEMPTS = 5
    acpiStateToPowerState = {
        'S0/G0': PowerState.ON,
        'S3': PowerState.SLEEP,  # memory is still powered
        'S5/G2': PowerState.OFF,  # soft-off
    }
    iNumFailedAttempts = 0
    while True:
        (returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State'])
        if returnCode != 0:
            # error ... it's either because the machine is unplugged or because the machine is busy. In order to
            # differentiate these 2 cases, we try again and if this command fails too many times then we decide
            # it's unplugged
            iNumFailedAttempts += 1
            if iNumFailedAttempts >= iMAX_NUM_ATTEMPTS:
                logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
                return PowerState.UNPLUGGED
            logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
            time.sleep(5)
            continue
        matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
        if matchObj is None:
            if bBUG_00000002_IS_STILL_ACTIVE:
                # degenerate ipmitool output : apparently it's a 'feature' of all 4 core machines (if the machine
                # is woken up using ipmitool, then no power on event is logged). Power is assumed to be on.
                return PowerState.ON
            assert matchObj
        strAcpiState = matchObj.group('AcpiState')
        if strAcpiState not in acpiStateToPowerState:
            print(strAcpiState)
            assert False
        return acpiStateToPowerState.get(strAcpiState, PowerState.UNKNOWN)
def wakeUpMachine(machineName):
    """
    sends the ipmi 'chassis power on' order to the given machine

    this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
    @return true on success, false otherwise
    @note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
    """
    powerOnArgs = ['chassis', 'power', 'on']
    (returnCode, unused_stdout, unused_stderr) = executeIpmiCommand(machineName, powerOnArgs)
    # this command can fail if the machine is manually unplugged for example
    return returnCode == 0
def blockingPutMachineToSleep(machineName):
    """
    sends sleep orders (pmset sleepnow through ssh) to the given machine until its power state becomes SLEEP

    @return true on success, false otherwise
    """
    logInfo('putting machine %s to sleep...' % machineName)
    bBUG_239_IS_STILL_ALIVE = True
    iMaxNumAttempts = 5
    iMaxGoToSleepDuration = 30  # in seconds
    iPollingPeriod = 5  # in seconds
    iAttempt = 0
    # note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
    while True:
        # note : pmset must be executed as root
        (unused_returnCode, unused_stdout, unused_stderr) = executeProgram(['ssh', machineName, 'pmset sleepnow'])
        # check if the machine actually went to sleep, by polling its power state
        for unused_iPoll in range(iMaxGoToSleepDuration // iPollingPeriod):
            time.sleep(iPollingPeriod)
            ePowerState = getPowerState(machineName)
            if ePowerState == PowerState.SLEEP:
                logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
                return True
            if ePowerState != PowerState.ON:
                logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
            assert ePowerState == PowerState.ON
        iAttempt += 1
        if iAttempt > iMaxNumAttempts:
            if bBUG_239_IS_STILL_ALIVE:
                logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
            else:
                logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
            return False
        logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
def getPowerState(machineName):
    """
    reads the power state of the given machine from its 'ACPI State' ipmi sensor

    @param machineName the name of the machine to query
    @return a PowerState value : ON, SLEEP or OFF depending on the reported acpi state, ON when the ipmitool
        output contains no acpi state (see bug 00000002), UNPLUGGED when the sensor could not be read 5 times
    """
    bBUG_00000002_IS_STILL_ACTIVE = True
    iMAX_NUM_ATTEMPTS = 5
    acpiStateToPowerState = {
        'S0/G0': PowerState.ON,
        'S3': PowerState.SLEEP,  # memory is still powered
        'S5/G2': PowerState.OFF,  # soft-off
    }
    iNumFailedAttempts = 0
    while True:
        (returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State'])
        if returnCode != 0:
            # error ... it's either because the machine is unplugged or because the machine is busy. In order to
            # differentiate these 2 cases, we try again and if this command fails too many times then we decide
            # it's unplugged
            iNumFailedAttempts += 1
            if iNumFailedAttempts >= iMAX_NUM_ATTEMPTS:
                logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
                return PowerState.UNPLUGGED
            logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
            time.sleep(5)
            continue
        # raw string : the pattern contains backslash escapes that are not valid python string escapes
        matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
        if matchObj is None:
            if bBUG_00000002_IS_STILL_ACTIVE:
                # degenerate ipmitool output : apparently it's a 'feature' of all 4 core machines (if the machine
                # is woken up using ipmitool, then no power on event is logged). Power is assumed to be on.
                return PowerState.ON
            assert matchObj
        strAcpiState = matchObj.group('AcpiState')
        if strAcpiState not in acpiStateToPowerState:
            print(strAcpiState)
            assert False
        return acpiStateToPowerState.get(strAcpiState, PowerState.UNKNOWN)
def wakeUpMachine(machineName):
    """
    sends the ipmi 'chassis power on' order to the given machine

    this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
    @return true on success, false otherwise
    @note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
    """
    powerOnArgs = ['chassis', 'power', 'on']
    (returnCode, unused_stdout, unused_stderr) = executeIpmiCommand(machineName, powerOnArgs)
    # this command can fail if the machine is manually unplugged for example
    return returnCode == 0
def blockingPutMachineToSleep(machineName):
    """
    sends sleep orders (pmset sleepnow through ssh) to the given machine until its power state becomes SLEEP

    @return true on success, false otherwise
    """
    logInfo('putting machine %s to sleep...' % machineName)
    bBUG_239_IS_STILL_ALIVE = True
    iMaxNumAttempts = 5
    iMaxGoToSleepDuration = 30  # in seconds
    iPollingPeriod = 5  # in seconds
    iAttempt = 0
    # note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
    while True:
        # note : pmset must be executed as root
        (unused_returnCode, unused_stdout, unused_stderr) = executeProgram(['ssh', machineName, 'pmset sleepnow'])
        # check if the machine actually went to sleep, by polling its power state
        for unused_iPoll in range(iMaxGoToSleepDuration // iPollingPeriod):
            time.sleep(iPollingPeriod)
            ePowerState = getPowerState(machineName)
            if ePowerState == PowerState.SLEEP:
                logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
                return True
            if ePowerState != PowerState.ON:
                logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
            assert ePowerState == PowerState.ON
        iAttempt += 1
        if iAttempt > iMaxNumAttempts:
            if bBUG_239_IS_STILL_ALIVE:
                logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
            else:
                logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
            return False
        logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
def blockingWakeUpMachine(machineName):
    """
    wakes up the given machine and waits until it responds

    @param machineName the name of the machine to wake up
    @return True if the machine has been woken up and responds; False if the wake up order failed 50 times
        in a row, or if the machine failed twice to respond within 5 minutes after a successful wake up order
    """
    logInfo('waking up machine %s...' % machineName)
    iMaxNumWakeUpAttempts = 50
    iDelay = 5
    WAKEUPTIMEOUT = 5 * 60  # max number of seconds allowed for a machine to be alive after a wakeup request
    numAttempts = 0
    # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
    while True:
        # step 1 : send the wake up order. This can fail if the machine is already in a transition;
        # in that case we try several times before giving up
        iNumWakeUpAttempts = 0
        while not wakeUpMachine(machineName):
            iNumWakeUpAttempts += 1
            if iNumWakeUpAttempts >= iMaxNumWakeUpAttempts:
                logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
                return False  # couldn't wake up to machine for whatever reason
            logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
            time.sleep(iDelay)
        # step 2 : wait until the machine is operational
        bTimedOut = False
        wakeUpToAliveDuration = 0
        while not Lib.SimpaDbUtil.isMachineResponding(machineName):
            time.sleep(5)
            wakeUpToAliveDuration += 5
            if wakeUpToAliveDuration > WAKEUPTIMEOUT:
                # the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
                logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
                bTimedOut = True
                break
        if not bTimedOut:
            # wake up completed
            logInfo('Waking up of machine %s completed successfully' % machineName)
            return True
        numAttempts += 1
        if numAttempts >= 2:
            logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
            return False  # power state changed manually ?
        logWarning('attempting to wake up %s one more time' % (machineName))
def onException(exception):
    """
    last resort handler : reports the given exception, then kills the whole process

    the exception and the current traceback are logged, printed and sent by email to the
    administrator; the process is then killed (kill -9) so that the other threads stop immediately.

    @param exception the exception that has been caught
    """
    sys.stdout.flush()
    strExceptionType = type(exception)
    # python 3 exceptions have no 'message' attribute : fall back to the string form of the exception,
    # otherwise this handler itself would raise before reaching the kill below
    strExceptionMessage = getattr(exception, 'message', str(exception))
    strMessage = 'exception %s : %s\n' % (strExceptionType, strExceptionMessage)
    strMessage += traceback.format_exc()
    logError(strMessage)
    print(strMessage)
    try:
        # I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
        # by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that is this operation fails, then the
        # kill of the main process is still executed.
        Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
    except BaseException:
        # deliberately broad : whatever goes wrong with the notification, the process must still be killed
        logError("Could not send the email to notify the administrator that cluster controller failed")
    executeCommand('kill -9 %d' % os.getpid())  # stop other threads immediately
    sys.exit()

View File

@ -1,42 +1,47 @@
import Sensor
class ClusterNodeSensorsReadings:
"""
"""
"""
POWERSTATE_UNKNOWN=0
POWERSTATE_OFF=1
POWERSTATE_ON=2
POWERSTATE_SLEEP=3
"""
def __init__(self, clusterNodeName):
self.m_clusterNodeName = clusterNodeName
self.m_sensors = {}
#self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
return
def addSensor(self, sensor):
self.m_sensors[sensor.m_name] = sensor
def dump(self):
for key,sensor in self.m_sensors.items():
sensor.dump()
return
#def getPowerState(self):
# return self.m_powerState
def getLowestTemperature( self ):
#log('ClusterNodeSensorsReadings::getLowestTemperature : start')
lowestTemperature = 0.0
lowestTemperatureIsDefined = False
for key,sensor in self.m_sensors.items():
#log('ClusterNodeSensorsReadings::getLowestTemperature : start')
if sensor.typeName() == 'Temperature':
sensor.m_temperature
if lowestTemperatureIsDefined:
if sensor.m_temperature < lowestTemperature:
lowestTemperature = sensor.m_temperature
else:
lowestTemperature = sensor.m_temperature
lowestTemperatureIsDefined = True
assert( lowestTemperatureIsDefined )
#log('ClusterNodeSensorsReadings::getLowestTemperature : end')
return lowestTemperature
"""
"""
"""
POWERSTATE_UNKNOWN=0
POWERSTATE_OFF=1
POWERSTATE_ON=2
POWERSTATE_SLEEP=3
"""
def __init__(self, clusterNodeName):
self.m_clusterNodeName = clusterNodeName
self.m_sensors = {}
# self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
return
def addSensor(self, sensor):
self.m_sensors[sensor.m_name] = sensor
def dump(self):
for key, sensor in self.m_sensors.items():
sensor.dump()
return
# def getPowerState(self):
# return self.m_powerState
def getLowestTemperature(self):
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
lowestTemperature = 0.0
lowestTemperatureIsDefined = False
for key, sensor in self.m_sensors.items():
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
if sensor.typeName() == 'Temperature':
sensor.m_temperature
if lowestTemperatureIsDefined:
if sensor.m_temperature < lowestTemperature:
lowestTemperature = sensor.m_temperature
else:
lowestTemperature = sensor.m_temperature
lowestTemperatureIsDefined = True
assert lowestTemperatureIsDefined
# log('ClusterNodeSensorsReadings::getLowestTemperature : end')
return lowestTemperature

View File

@ -3,79 +3,81 @@ import re
from Sensor import FanSensor, TemperatureSensor
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
class IpmiTool202Parser:
def parseSensorOutput( self, strOutput, clusterNodeName ):
sensorReadings=ClusterNodeSensorsReadings(clusterNodeName)
f = io.StringIO(strOutput)
line = f.readline()
while( len(line) > 0 ):
#print line,
matchObj = re.match( '^Sensor ID[ ]*\: \'(?P<sensorName>[a-zA-Z 0-9]+)\'', line )
if matchObj:
sensorName = matchObj.group('sensorName')
# print sensorName
# read the entity id
line = f.readline()
matchObj = re.match( '^ Entity ID[ ]*\: (?P<entityId>[0-9\.]+)', line )
assert(matchObj)
entityId = matchObj.group('entityId')
# print entityId
# read the sensor type
line = f.readline()
matchObj = re.match( '^ Sensor Type[\(\)a-zA-Z ]*\: (?P<sensorType>[a-zA-Z \(\)]+)', line )
assert(matchObj)
sensorType = matchObj.group('sensorType')
#print sensorType
if sensorType == 'Fan':
rpms = self.parseFanSensorOutput(f)
if temperature != None:
sensor = FanSensor(sensorName)
sensor.m_rpms = rpms
elif sensorType == 'Temperature':
temperature = self.parseTemperatureSensorOutput(f)
if temperature != None:
sensor = TemperatureSensor(sensorName)
sensor.m_temperature = temperature
else:
#ignoring other sensors
sensor = None
if sensor:
sensorReadings.addSensor( sensor )
else:
None
#assert(False)
line = f.readline()
f.close()
def parseFanSensorOutput(self, file):
"""
reads the fan specific ipdmitool output
"""
line = file.readline()
#print line
matchObj = re.match( '^ Sensor Reading[ ]*\: (?P<numRpms>[0-9]+) \(\+/\- (?P<rpmsPrecision>[0-9]+)\) RPM', line )
if(matchObj):
numRpms = matchObj.group('numRpms')
#print numRpms
rpms = float( numRpms )
return rpms
else:
matchObj = re.match( '^ Sensor Reading[ ]*\: Not Present', line )
assert(matchObj)
return None
def parseTemperatureSensorOutput(self, file):
"""
reads the temperature specific ipdmitool output
"""
# Sensor Reading : 36 (+/- 0) degrees C
line = file.readline()
#print line
matchObj = re.match( '^ Sensor Reading[ ]*\: (?P<temperature>[0-9]+) \(\+/\- (?P<precision>[0-9]+)\) degrees C', line )
if(matchObj):
temperature = matchObj.group('temperature')
temperature = float( temperature )
return temperature
else:
matchObj = re.match( '^ Sensor Reading[ ]*\: Not Present', line )
assert(matchObj)
return None
def parseSensorOutput(self, strOutput, clusterNodeName):
sensorReadings = ClusterNodeSensorsReadings(clusterNodeName)
f = io.StringIO(strOutput)
line = f.readline()
while len(line) > 0:
# print line,
matchObj = re.match(r'^Sensor ID[ ]*\: \'(?P<sensorName>[a-zA-Z 0-9]+)\'', line)
if matchObj:
sensorName = matchObj.group('sensorName')
# print sensorName
# read the entity id
line = f.readline()
matchObj = re.match(r'^ Entity ID[ ]*\: (?P<entityId>[0-9\.]+)', line)
assert matchObj
entityId = matchObj.group('entityId')
# print entityId
# read the sensor type
line = f.readline()
matchObj = re.match(r'^ Sensor Type[\(\)a-zA-Z ]*\: (?P<sensorType>[a-zA-Z \(\)]+)', line)
assert matchObj
sensorType = matchObj.group('sensorType')
# print sensorType
if sensorType == 'Fan':
rpms = self.parseFanSensorOutput(f)
if temperature is not None:
sensor = FanSensor(sensorName)
sensor.m_rpms = rpms
elif sensorType == 'Temperature':
temperature = self.parseTemperatureSensorOutput(f)
if temperature is not None:
sensor = TemperatureSensor(sensorName)
sensor.m_temperature = temperature
else:
# ignoring other sensors
sensor = None
if sensor:
sensorReadings.addSensor(sensor)
else:
None
# assert(False)
line = f.readline()
f.close()
def parseFanSensorOutput(self, file):
    """
    decodes the 'Sensor Reading' line of a fan sensor

    :param file: a file-like object; exactly one line is read from it
    :return: the fan speed in rpm as a float, or None if the reading is 'Not Present'
    :raises AssertionError: if the line is neither a rpm reading nor 'Not Present'
    """
    readingLine = file.readline()
    rpmMatch = re.match(r'^ Sensor Reading[ ]*\: (?P<numRpms>[0-9]+) \(\+/\- (?P<rpmsPrecision>[0-9]+)\) RPM', readingLine)
    if rpmMatch is None:
        # the only other accepted form is a sensor that is not present
        absentMatch = re.match(r'^ Sensor Reading[ ]*\: Not Present', readingLine)
        assert absentMatch
        return None
    return float(rpmMatch.group('numRpms'))
def parseTemperatureSensorOutput(self, file):
    """
    decodes the 'Sensor Reading' line of a temperature sensor, eg
     Sensor Reading : 36 (+/- 0) degrees C

    :param file: a file-like object; exactly one line is read from it
    :return: the temperature in degrees C as a float, or None if the reading is 'Not Present'
    :raises AssertionError: if the line is neither a temperature reading nor 'Not Present'
    """
    readingLine = file.readline()
    temperatureMatch = re.match(r'^ Sensor Reading[ ]*\: (?P<temperature>[0-9]+) \(\+/\- (?P<precision>[0-9]+)\) degrees C', readingLine)
    if temperatureMatch is None:
        # the only other accepted form is a sensor that is not present
        absentMatch = re.match(r'^ Sensor Reading[ ]*\: Not Present', readingLine)
        assert absentMatch
        return None
    return float(temperatureMatch.group('temperature'))

View File

@ -3,37 +3,37 @@ import re
from Sensor import FanSensor, TemperatureSensor
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
class IpmiTool218Parser:
    """
    parser for a sensor listing made of one pipe-separated row per sensor : name | value | unit
    """

    def parseSensorOutput(self, strOutput, clusterNodeName):
        """
        extracts the temperature and fan sensors from a pipe-separated sensor table

        Rows whose unit is 'degrees C' become a TemperatureSensor, rows whose unit is
        'RPM' become a FanSensor. Rows with any other unit, and lines that don't match
        the expected row format, are ignored.

        :param str strOutput: the text to parse
        :param str clusterNodeName: the name handed over to ClusterNodeSensorsReadings
        :return: the ClusterNodeSensorsReadings populated with the sensors found
        """
        sensorReadings = ClusterNodeSensorsReadings(clusterNodeName)
        with io.StringIO(strOutput) as f:
            for line in f:
                matchObj = re.match(r'^(?P<sensorName>[a-zA-Z 0-9]+[a-zA-Z 0-9]*[a-zA-Z0-9])[ ]*\| (?P<sensorValue>[\.0-9]+)[ ]*\| (?P<sensorUnit>[a-zA-Z0-9][a-zA-Z 0-9]*[a-zA-Z0-9])[?]*', line)
                if not matchObj:
                    continue
                sensorName = matchObj.group('sensorName')
                sensorValue = matchObj.group('sensorValue')
                sensorUnit = matchObj.group('sensorUnit')
                sensor = None
                if sensorUnit == 'degrees C':
                    sensor = TemperatureSensor(sensorName)
                    sensor.m_temperature = float(sensorValue)
                elif sensorUnit == 'RPM':
                    sensor = FanSensor(sensorName)
                    sensor.m_rpms = float(sensorValue)
                if sensor:
                    sensorReadings.addSensor(sensor)
        return sensorReadings

View File

@ -1,23 +1,23 @@
class Sensor:
    """
    base class of the hardware sensors of a machine; a sensor is identified by its name
    """

    def __init__(self, sensorName):
        """
        :param sensorName: the name of this sensor
        """
        self.m_name = sensorName
        self.m_isValid = True  # false if this sensor is not actually present on the target machine

    def dump(self):
        """
        prints the name of this sensor on the standard output
        """
        # a single parenthesised argument prints the same text with the print statement and with the print function
        print(self.m_name)
class FanSensor(Sensor):
    """
    a sensor that measures the rotation speed of a fan

    The speed is stored in the m_rpms attribute, which is not set by the constructor :
    it has to be assigned by the code that creates the sensor before dump() is called.
    """

    def __init__(self, sensorName):
        """
        :param sensorName: the name of this sensor
        """
        Sensor.__init__(self, sensorName)

    def dump(self):
        """
        prints the name and the speed of this fan on the standard output
        """
        # the format string reproduces the spacing of the former
        # print 'Fan \'', name, '\' rpm=', rpms statement (one space between items)
        print("Fan ' %s ' rpm= %s" % (self.m_name, self.m_rpms))

    def typeName(self):
        """
        :return: the string 'Fan'
        """
        return 'Fan'
class TemperatureSensor(Sensor):
    """
    a sensor that measures a temperature

    The temperature is stored in the m_temperature attribute, which is not set by the
    constructor : it has to be assigned by the code that creates the sensor before
    dump() is called.
    """

    def __init__(self, sensorName):
        """
        :param sensorName: the name of this sensor
        """
        Sensor.__init__(self, sensorName)

    def dump(self):
        """
        prints the name and the temperature of this sensor on the standard output
        """
        # the format string reproduces the spacing of the former
        # print 'Temperature \'', name, '\' temperature=', value statement (one space between items)
        print("Temperature ' %s ' temperature= %s" % (self.m_name, self.m_temperature))

    def typeName(self):
        """
        :return: the string 'Temperature'
        """
        return 'Temperature'

View File

@ -6,9 +6,9 @@ if sys.version_info < (3, 0):
else:
from io import StringIO
import re
from .wol import *
from .wol import wake_on_lan
import os
from .Util import *
from .Util import executeProgram, executeCommand, log
import abc
import sqlite3
from .mysql2sqlite import mysql_to_sqlite
@ -17,7 +17,7 @@ from .mysql2sqlite import mysql_to_sqlite
def isMachineResponding(machineName):
(returnCode, stdout, stderr) = executeProgram(['ping', '-o', '-t', '1', machineName])
# log( 'isMachineResponding : result of command %s : %d' % (command, returnCode) )
if returnCode == 0:
return True
else:
@ -33,7 +33,7 @@ def isMachineResponding(machineName):
# don't stop the program until we understand bug00000004
else:
log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
assert(False)
assert False
return False
@ -60,10 +60,10 @@ class RemoteMysqlDb(ISqlDatabaseBackend):
self._db_user = db_user
self._db_name = db_name
self._connect()
def _connect(self):
    """
    opens the mysql connection and stores it in self._conn

    The connection is made to the database self._db_name on the server
    self._db_server_fqdn, as the user self._db_user with an empty password.
    """
    self._conn = MySQLdb.connect(self._db_server_fqdn, self._db_user, '', self._db_name)
    assert self._conn
def query(self, sql_query):
"""
@ -73,7 +73,7 @@ class RemoteMysqlDb(ISqlDatabaseBackend):
rows = conn.store_result()
return rows
class SqlFile(ISqlDatabaseBackend):
def __init__(self, sql_file_path, truncate_hex_strings=False):
"""
@ -128,7 +128,7 @@ class TableAttrNotFound(Exception):
class SqlDatabaseReader(object):
def __init__(self, inv_provider):
"""
:param ISqlDatabaseBackend inv_provider: the input that provides the inventory data
@ -138,7 +138,7 @@ class SqlDatabaseReader(object):
def query(self, sql_query):
"""
performs a query on the sql database
:param str sql_query: the sql query to perform
"""
return self._inv_provider.query(sql_query)
@ -146,7 +146,7 @@ class SqlDatabaseReader(object):
def get_table_attr(self, table, key_name, key_value, attr_name):
"""
reads the value of the fiven attribute of the given item in the given table
:param str table: the name of the table to read
:param str key_name: the name of the column that stores the id of the item to read
:param str key_value: the id of the item to read
@ -163,13 +163,13 @@ class SqlDatabaseReader(object):
def machineNameToMacAddress(machineName):
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert(conn)
assert conn
sqlQuery = """SELECT mac_address FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='normal'"""
# print sqlQuery
conn.query(sqlQuery)
r = conn.store_result()
row = r.fetch_row(0)
assert( len(row) == 1)
assert len(row) == 1
# print 'row =', row
macAddress = row[0][0]
# print macAddress
@ -182,13 +182,13 @@ def getLightOutManagementIpAddress(machineName):
the light out management ip of servers allows to talk to the server even when it's asleep
"""
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert(conn)
assert conn
sqlQuery = """SELECT ip_address_1,ip_address_2,ip_address_3,ip_address_4 FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='light_out_management'"""
# print sqlQuery
conn.query(sqlQuery)
r = conn.store_result()
row = r.fetch_row(0)
assert(len(row) == 1)
assert len(row) == 1
# print 'row =', row
ipAddress = ('%s.%s.%s.%s') % (row[0][0], row[0][1], row[0][2], row[0][3])
# print macAddress
@ -199,7 +199,7 @@ def getLightOutManagementIpAddress(machineName):
def getClusterMachinesNames():
clusterMachinesNames = []
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert(conn)
assert conn
sqlQuery = """SELECT name FROM machines WHERE affectation='cluster'"""
# print sqlQuery
conn.query(sqlQuery)
@ -231,7 +231,7 @@ def putToSleep(machineName):
print 'stderr :'
print stderr
"""
assert(returnCode == 0)
assert returnCode == 0
# check if the command succeeded by looking at the output (that's the only way I found)
f = StringIO.StringIO(stdout)
line = f.readline()
@ -248,7 +248,7 @@ def wakeUp(machineName):
wake_on_lan(macAddress)
return True
def isNonRespondingMachineSleeping(machineName):
"""
note : crappy method to detect if the machine is sleeping (if other methods are available, I would be very interested)

View File

@ -1,61 +1,67 @@
#!/usr/bin/python
#import sys
#sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python')
# import sys
# sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python')
import re
#import Lib.Util
# import Lib.Util
class SgeConfig:
def __init__( self ):
self.m_attrs={}
def hasAttr(self, attr_name):
return attr_name in self.m_attrs.keys()
def getAttr( self, strAttrName ):
return self.m_attrs[ strAttrName ]
def setAttr( self, strAttrName, strAttrValue ):
assert isinstance(strAttrName, str)
assert isinstance(strAttrValue, str)
self.m_attrs[ strAttrName ] = strAttrValue
def loadFromSgeFormat1String( self, strSgeConfigString ):
"""
loads attrs from a string such as :
hostname simpatix11.univ-rennes1.fr
load_scaling NONE
complex_values has_molpro_2010=0
load_values arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
swap_total=0.000000M,virtual_total=8192.000000M, \
load_avg=5.126465,load_short=5.186523, \
load_medium=5.126465,load_long=5.087891, \
mem_free=6654.054688M,swap_free=0.000000M, \
virtual_free=6654.054688M,mem_used=1537.945312M, \
swap_used=0.000000M,virtual_used=1537.945312M, \
cpu=100.000000,m_topology=NONE,m_topology_inuse=NONE, \
m_socket=0,m_core=0,np_load_avg=1.281616, \
np_load_short=1.296631,np_load_medium=1.281616, \
np_load_long=1.271973
processors 4
user_lists NONE
xuser_lists NONE
projects NONE
xprojects NONE
usage_scaling NONE
report_variables NONE
"""
self.m_attrs={}
# put multiline attributes on one line
strSgeConfigString = re.sub(r"\\\n", "", strSgeConfigString)
for strAttrDef in strSgeConfigString.split("\n"):
# print("strAttrDef=%s" % strAttrDef)
if len(strAttrDef) != 0:
matchObj = re.match( "^(?P<attrName>[^\s]+)[ ]+(?P<attrValue>[^\s].*)$", strAttrDef )
assert matchObj is not None
#print( '%s = %s\n' % (matchObj.group("attrName"), matchObj.group("attrValue") ) )
self.m_attrs[ matchObj.group("attrName") ] = matchObj.group("attrValue")
def loadFromSgeFormat2String( self, strSgeConfigString ):
"""
loads attrs from a string such as :
arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
def __init__(self):
self.m_attrs = {}
def hasAttr(self, attr_name):
return attr_name in self.m_attrs.keys()
def getAttr(self, strAttrName):
return self.m_attrs[strAttrName]
def setAttr(self, strAttrName, strAttrValue):
assert isinstance(strAttrName, str)
assert isinstance(strAttrValue, str)
self.m_attrs[strAttrName] = strAttrValue
def loadFromSgeFormat1String(self, strSgeConfigString):
"""
loads attrs from a string such as :
hostname simpatix11.univ-rennes1.fr
load_scaling NONE
complex_values has_molpro_2010=0
load_values arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
swap_total=0.000000M,virtual_total=8192.000000M, \
load_avg=5.126465,load_short=5.186523, \
load_medium=5.126465,load_long=5.087891, \
mem_free=6654.054688M,swap_free=0.000000M, \
virtual_free=6654.054688M,mem_used=1537.945312M, \
swap_used=0.000000M,virtual_used=1537.945312M, \
cpu=100.000000,m_topology=NONE,m_topology_inuse=NONE, \
m_socket=0,m_core=0,np_load_avg=1.281616, \
np_load_short=1.296631,np_load_medium=1.281616, \
np_load_long=1.271973
processors 4
user_lists NONE
xuser_lists NONE
projects NONE
xprojects NONE
usage_scaling NONE
report_variables NONE
"""
self.m_attrs = {}
# put multiline attributes on one line
strSgeConfigString = re.sub(r"\\\n", "", strSgeConfigString)
for strAttrDef in strSgeConfigString.split("\n"):
# print("strAttrDef=%s" % strAttrDef)
if len(strAttrDef) != 0:
matchObj = re.match(r"^(?P<attrName>[^\s]+)[]+(?P<attrValue>[^\s].*)$", strAttrDef)
assert matchObj is not None
# print('%s = %s\n' % (matchObj.group("attrName"), matchObj.group("attrValue")))
self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")
def loadFromSgeFormat2String(self, strSgeConfigString):
"""
loads attrs from a string such as :
arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
swap_total=0.000000M,virtual_total=8192.000000M, \
load_avg=5.126465,load_short=5.186523, \
load_medium=5.126465,load_long=5.087891, \
@ -66,87 +72,88 @@ class SgeConfig:
m_socket=0,m_core=0,np_load_avg=1.281616, \
np_load_short=1.296631,np_load_medium=1.281616, \
np_load_long=1.271973
"""
self.m_attrs={}
if strSgeConfigString != "NONE":
for strAttrDef in strSgeConfigString.split(","):
#print strAttrDef
if len(strAttrDef) != 0:
matchObj = re.match( "^\s*(?P<attrName>[^=]+)=(?P<attrValue>.*)$", strAttrDef )
#print matchObj.group("attrName")
self.m_attrs[ matchObj.group("attrName") ] = matchObj.group("attrValue")
def asFormat1String( self ):
strResult = ""
for (k,v) in self.m_attrs.items():
#print "%s %s" % (k,v)
# if the attribute's value is a list of comma separated strings, make sure there are no spaces after the commas, otherwise the value is not properly interpreted when read back into sge
# for example if the user sets the value of administrator_mail (using qconf -mconf global) to "alice@univ-rennes1.fr, bob@univ-rennes1.fr", then the next call to qconf -sconf global will show a wrong value for administrator_mail, as shown below:
# pag_cmd none
# administrator_mail alice@univ-rennes1.fr,
# token_extend_time none
"""
self.m_attrs = {}
if strSgeConfigString != "NONE":
for strAttrDef in strSgeConfigString.split(","):
# print strAttrDef
if len(strAttrDef) != 0:
matchObj = re.match(r"^\s*(?P<attrName>[^=]+)=(?P<attrValue>.*)$", strAttrDef)
# print matchObj.group("attrName")
self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")
# it's even worse, as it messes with the whole config, putting unwanted attributes in the reporting_params attribute. In short, inputting commas followed by spaces seems to confuse sge....
def asFormat1String(self):
strResult = ""
for (k, v) in self.m_attrs.items():
# print "%s %s" % (k,v)
# if the attribute's value is a list of comma separated strings, make sure there are no spaces after the commas, otherwise the value is not properly interpreted when read back into sge
# for example if the user sets the value of administrator_mail (using qconf -mconf global) to "alice@univ-rennes1.fr, bob@univ-rennes1.fr", then the next call to qconf -sconf global will show a wrong value for administrator_mail, as shown below:
# pag_cmd none
# administrator_mail alice@univ-rennes1.fr,
# token_extend_time none
# the tests below show that administrator_mail can only take a value, which can be a separator separated list, in which a separator is either :
# - separator_form_a: a comma character (no spaces after)
# - separator_form_b: a comma character, followed by any number of spaces, then a backslash, then \n
# it's even worse, as it messes with the whole config, putting unwanted attributes in the reporting_params attribute. In short, inputting commas followed by spaces seems to confuse sge....
# because we remove carriage returns in our values, the only storage option is separator_form_a
# the tests below show that administrator_mail can only take a value, which can be a separator separated list, in which a separator is either :
# - separator_form_a: a comma character (no spaces after)
# - separator_form_b: a comma character, followed by any number of spaces, then a backslash, then \n
# administrator_mail alice@univ-rennes1.fr
# -> ok
# because we remove carriage returns in our values, the only storage option is separator_form_a
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr, bob@univ-rennes1.fr
# -> messes up
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr, \
# bob@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr, bob@univ-rennes1.fr
# -> messes up
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
# bob2@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr, \
# bob@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
# bob2@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
# bob2@univ-rennes1.fr
# -> ok
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr \
# bob2@univ-rennes1.fr
# -> error
# root@physix-master:~# qconf -Mconf /tmp/global
# only a single value is allowed for configuration attribute "administrator_mail"
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr, \
# bob2@univ-rennes1.fr
# -> ok
cleaned_value = re.sub(',\s*', ',', v)
# administrator_mail alice@univ-rennes1.fr,bob@univ-rennes1.fr \
# bob2@univ-rennes1.fr
# -> error
# root@physix-master:~# qconf -Mconf /tmp/global
# only a single value is allowed for configuration attribute "administrator_mail"
# prevent space pollution in space separated values, such as in reporting_params (see https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=2812). If spaces are not compacted, the space separated values will contain more and more spaces and at some point corrupt the value : a line containing just a backslash, such as in the following example:
# reporting_params accounting=true reporting=false \
# flush_time=00:00:15 joblog=false \
# sharelog=00:00:00
# \
cleaned_value = re.sub('\s+', ' ', cleaned_value)
strResult += "%s %s\n" % (k, cleaned_value)
# print("strResult=%s" % strResult)
return strResult
def asFormat2String( self ):
strResult = ""
iNumAttrs = len(self.m_attrs)
if iNumAttrs == 0:
return "NONE"
iAttr = 0
for (k,v) in self.m_attrs.items():
#print "%s %s" % (k,v)
strResult += "%s=%s" % (k,v)
if iAttr != (iNumAttrs - 1):
strResult += ","
iAttr+=1
#print strSgeConfigString
return strResult
def dump( self ):
for (k,v) in self.m_attrs.items():
print("['%s']='%s'" % (k,v))
cleaned_value = re.sub(r',\s*', ',', v)
# prevent space pollution in space separated values, such as in reporting_params (see https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=2812). If spaces are not compacted, the space separated values will contain more and more spaces and at some point corrupt the value : a line containing just a backslash, such as in the following example:
# reporting_params accounting=true reporting=false \
# flush_time=00:00:15 joblog=false \
# sharelog=00:00:00
# \
cleaned_value = re.sub(r'\s+', ' ', cleaned_value)
strResult += "%s %s\n" % (k, cleaned_value)
# print("strResult=%s" % strResult)
return strResult
def asFormat2String(self):
strResult = ""
iNumAttrs = len(self.m_attrs)
if iNumAttrs == 0:
return "NONE"
iAttr = 0
for (k, v) in self.m_attrs.items():
# print "%s %s" % (k,v)
strResult += "%s=%s" % (k, v)
if iAttr != (iNumAttrs - 1):
strResult += ","
iAttr += 1
# print strSgeConfigString
return strResult
def dump(self):
for (k, v) in self.m_attrs.items():
print("['%s']='%s'" % (k, v))

View File

@ -15,6 +15,7 @@ else:
from html.parser import HTMLParser
from email.mime.text import MIMEText
def sendTextMail(strFrom, to, strSubject, text):
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
mail = MIMEText(text)
@ -31,7 +32,7 @@ def sendTextMail(strFrom, to, strSubject, text):
class Error(Exception):
    """
    exception that carries a message, available through the m_strMessage attribute and through str()
    """

    def __init__(self, strMessage):
        """
        :param strMessage: the description of the error
        """
        # forward the message to the base class, so that str() and args expose it
        # no matter whether the message was passed positionally or by keyword
        Exception.__init__(self, strMessage)
        self.m_strMessage = strMessage
def getHostName():
(returnCode, stdout, stderr) = executeProgram(['hostname', '-s'])
@ -47,7 +48,7 @@ def log(message):
def executeProgram(astrArguments):
# log('executeProgram : program [%s]' % (','.join(astrArguments)))
popen = subprocess.Popen( astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
stdout, stderr = popen.communicate()
# popen.wait()
result = (popen.returncode, stdout.decode(), stderr)
@ -60,7 +61,7 @@ def executeCommand(command):
"""
executes the shell command such as 'set x=1; myprog $x'
"""
popen = subprocess.Popen( [command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time
popen = subprocess.Popen([command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time
# if we don't specify the optional executable argument, then the default non interactive shell will be used. On debian, the default non-interactive shell is dash, which doesn't understand the keyword 'source' that we use in many places
stdout, stderr = popen.communicate()
# popen.wait()
@ -85,29 +86,29 @@ def executeCommandOn(target_machine_fqdn, command, user=None):
target = '%s@%s' % (user, target_machine_fqdn)
else:
target = target_machine_fqdn
result = executeProgram(['ssh', target, "%s" % command])
logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user))
return result
def getUpsStatus():
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.TokenList = []
def handle_data( self, data):
def handle_data(self, data):
data = data.strip()
if data and len(data) > 0:
self.TokenList.append(data)
# print data
def GetTokenList(self):
return self.TokenList
from urllib.request import urlopen
try:
url = 'http://Net Vision:public@129.20.27.119/PageMonComprehensive.html'
f = urlopen(url)
@ -118,7 +119,8 @@ def getUpsStatus():
return
h = MyHTMLParser()
h.feed(res)
tokensList = h.GetTokenList() # @UnusedVariable
tokensList = h.GetTokenList() # noqa:F841
if __name__ == '__main__':
from SimpaDbUtil import wakeUp

View File

@ -1,63 +0,0 @@
'''
The goal of this application is to convert a mno database into mno's web site compatible database (drupal)
'''
import sqlite3
import os
import re
import sys
from SimpaDbUtil import SqlFile, SqlDatabaseReader
from _sqlite3 import Row
class OrchestraSqlDb(object):
    """
    gives access to the orchestra's data by forwarding sql queries to a sql reader
    """

    def __init__(self, sql_reader):
        """
        :param SqlDatabaseReader sql_reader: the inventory database
        """
        super(OrchestraSqlDb, self).__init__()
        self._sql_reader = sql_reader

    def query(self, sql_query):
        """
        :param sql_query: the sql query, handed over unchanged to the sql reader
        :return: whatever the sql reader's query method returns
        """
        reader = self._sql_reader
        return reader.query(sql_query)
# placeholder type : no attribute or behaviour is defined yet
class Concert(object):
    pass
# placeholder type : no attribute or behaviour is defined yet
class Recording(object):
    pass
class OrchestraDb(object):
    """
    reads the concerts and their tracks from a sql dump and prints their titles
    """

    def __init__(self, mno_drupal_db_sql_file_path):
        """
        :param mno_drupal_db_sql_file_path: the path of the sql file, handed over to SqlFile
        """
        self.concerts = {}
        orchestra_sql_db = OrchestraSqlDb(SqlDatabaseReader(SqlFile(mno_drupal_db_sql_file_path)))
        self._parse_from_orchestra_drupal_db(orchestra_sql_db)

    def _parse_from_orchestra_drupal_db(self, orchestra_sql_db):
        """
        prints the title of each concert, followed by the tab-indented title of each of its tracks

        :param OrchestraSqlDb orchestra_sql_db:
        """
        for (nid, title) in orchestra_sql_db.query("SELECT nid,title FROM node WHERE type is 'concert'"):
            print(title)
            concert_id = int(nid)
            tracks_query = "SELECT field_tracks_target_id FROM field_revision_field_tracks WHERE entity_id=%d" % concert_id
            for (field_tracks_target_id, ) in orchestra_sql_db.query(tracks_query):
                # only the first row returned for a track is used
                title_rows = orchestra_sql_db.query("SELECT title FROM node WHERE nid=%d" % field_tracks_target_id)
                (recording_title, ) = title_rows[0]
                print("\t%s" % recording_title)
# module-level statement : the database is built (and the titles printed) from a hard-coded, developer-specific path as soon as this module is executed or imported
mno_db = OrchestraDb('/Users/graffy/data/Perso/MeltingNotes_work.git/website/v2_drupal/melting_drupal.sql')

View File

@ -1,84 +1,83 @@
import re
def mysql_to_sqlite(mysql_sql_code, truncate_hex_strings=False):
    """
    converts a mysql-compatible sql code into a sqlite-compatible sql code

    The conversion is a sequence of textual substitutions that remove or rewrite the
    mysql constructs that sqlite doesn't accept; the order of the substitutions matters.

    note: the original code was found on internet, then tweaked

    :param str mysql_sql_code: the mysql-compatible sql code to convert
    :param bool truncate_hex_strings: if True, each hexadecimal literal (0x...) is replaced with 0xdeadbeef
    :return: the sqlite-compatible sql code
    :rtype: str
    """
    flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    content = mysql_sql_code

    # unused commands
    COMMAND_RE = re.compile(r'^(SET).*?;\n$', flags)
    content = COMMAND_RE.sub('', content)

    # sqlite doesn't like COMMENT= , remove it properly before the table constraint filter because the table constraint filter is not clever enough to cope with ; inside comment strings
    # ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='A list of URL aliases for Drupal paths; a user may visit...';
    COMMENTS_EQUAL_RE = re.compile(r'\s+COMMENT=\'[^\']*\'', flags)
    content = COMMENTS_EQUAL_RE.sub('', content)

    # table constraints
    TCONS_RE = re.compile(r'\)(\s*(CHARSET|DEFAULT|ENGINE)(=.*?)?\s*)+;', flags)
    content = TCONS_RE.sub(');', content)

    # remove comments
    # `nid` int(10) UNSIGNED NOT NULL DEFAULT '0' COMMENT 'The node.nid this record affects.',
    COMMENTS_RE = re.compile(r'\s+COMMENT\s+\'[^\']*\'', flags)
    content = COMMENTS_RE.sub('', content)

    # sqlite doesn't like ' being escaped as \', use '' instead
    content = re.sub(r'\\\'', '\'\'', content, flags=flags)

    if truncate_hex_strings:
        # sqlite doesn't like too big hex strings 0x613a343a7b733a383a
        content = re.sub(r'0x[0-9a-f]+', '0xdeadbeef', content, flags=flags)

    # sqlite doesn't understand
    # `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
    content = re.sub(r'\s+CHARACTER SET\s+[^\s]+', '', content, flags=flags)

    # sqlite doesn't know the utf8_bin :
    # `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
    # no such collation sequence: utf8_bin
    content = re.sub(r'\s+COLLATE\s+utf8_bin\s+', ' ', content, flags=flags)

    # sqlite doesn't like 'unsigned' as in `ip_address_3` tinyint(3) unsigned NOT NULL default '27',
    content = re.sub(r' unsigned ', ' ', content, flags=flags)

    # sqlite doesn't like 'enum' as in `type` enum('normal','light_out_management') NOT NULL default 'normal',,
    content = re.sub(r' enum\([^\)]*\) ', ' varchar(255) ', content)

    # sqlite doesn't support much of alter table (https://www.sqlite.org/lang_altertable.html). The following is not supported :
    # ALTER TABLE `blocked_ips`
    # ADD PRIMARY KEY (`iid`),
    # ADD KEY `blocked_ip` (`ip`);
    content = re.sub(r'alter table [^;]*;', '', content, flags=flags)

    # COMMIT;
    # sqlite3.OperationalError: cannot commit - no transaction is active
    content = re.sub(r'commit\s*;', '', content, flags=flags)

    # insert multiple values : a statement that inserts several rows is split into one statement per row
    INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*([^;]*);', flags)
    INSERTVALS_SPLIT_RE = re.compile(r'\)\s*,\s*\(', flags)

    def insertvals_replacer(match):
        insert, values = match.groups()
        # strip the parentheses that surround the whole list of rows, so that the split only leaves the content of each row
        values = re.sub(r'^\s*\(', '', values)
        values = re.sub(r'\)\s*$', '', values)
        replacement = ''
        for vals in INSERTVALS_SPLIT_RE.split(values):
            replacement = '%s\n%s (%s);' % (replacement, insert, vals)
        return replacement

    content = INSERTVALS_RE.sub(insertvals_replacer, content)
    return content

View File

@ -1,17 +1,17 @@
class Version(object):
"""
simple version number made of a series of positive integers separated by dots
distutils.version.StrictVersion : not good because versions such as 3.2.0.4 are not allowed (StrictVersion allows no more than 3 numbers)
distutils.version.LooseVersion : not good because the version string could be anything (https://stackoverflow.com/questions/11887762/how-do-i-compare-version-numbers-in-python)
"""
def __init__(self, version_as_string):
    """
    parses a version string into the list of its integer components (stored in ``self.numbers``)

    :param str version_as_string: eg '6.2u5' or '8.1.9'
    """
    # a 'u' is handled exactly like a dot : '6.2u5' yields [6, 2, 5]
    dotted = version_as_string.replace('u', '.')
    self.numbers = list(map(int, dotted.split('.')))
def get_number(self, index):
if index >= len(self.numbers):
return 0

View File

@ -4,6 +4,7 @@
import socket
import struct
def wake_on_lan(macaddress):
""" Switches on remote computers using WOL. """
@ -15,10 +16,10 @@ def wake_on_lan(macaddress):
macaddress = macaddress.replace(sep, '')
else:
raise ValueError('Incorrect MAC address format')
# Pad the synchronization stream.
data = ''.join(['FFFFFFFFFFFF', macaddress * 20])
send_data = ''
send_data = ''
# Split up the hex values and pack.
for i in range(0, len(data), 2):
@ -29,14 +30,13 @@ def wake_on_lan(macaddress):
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
sock.sendto(send_data, ('<broadcast>', 7))
if __name__ == '__main__':
# Use MAC addresses with any separators.
wake_on_lan('00:1E:52:F3:61:60') # simpatix28
#wake_on_lan('00:24:36:F2:D0:FA') # simpatix33
#wake_on_lan('0F:0F:DF:0F:BF:EF')
#wake_on_lan('0F-0F-DF-0F-BF-EF')
wake_on_lan('00:1E:52:F3:61:60') # simpatix28
# wake_on_lan('00:24:36:F2:D0:FA') # simpatix33
# wake_on_lan('0F:0F:DF:0F:BF:EF')
# wake_on_lan('0F-0F-DF-0F-BF-EF')
# or without any separators.
#wake_on_lan('0F0FDF0FBFEF')
# wake_on_lan('0F0FDF0FBFEF')

View File

@ -1,7 +1,8 @@
from setuptools import setup
setup(name='cocluto',
version=1.00,
setup(
name='cocluto',
version=1.01,
description='compute cluster utility tools',
url='https://git.ipr.univ-rennes1.fr/graffy/cocluto',
author='Guillaume Raffy',