fixed pylint errors and cleaned up

work related to Bug 3315 - make simpaweb django app a packageable application
This commit is contained in:
Guillaume Raffy 2023-05-23 17:27:12 +02:00
parent 7a5d32dec0
commit 270304f58e
28 changed files with 2323 additions and 2293 deletions

View File

@ -2,57 +2,69 @@
import sys import sys
sys.path.insert(0, '..') sys.path.insert(0, '..')
import os import os
import MySQLdb
import threading
from Lib.Util import * from Lib.Util import *
from Lib.SimpaDbUtil import * from Lib.SimpaDbUtil import *
import time import time
from ClusterStatus import ClusterStatus from ClusterStatus import ClusterStatus
from SlotAllocator import * from SlotAllocator import DecoupledSlotAllocator
from Log import * from Log import logDebug, logInfo
from ClusterNodeStatusUpdater import * from ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier
from SunGridEngine import SunGridEngine from SunGridEngine import SunGridEngine
import Util from Util import log, onException
from WebServer import WebServerThread from WebServer import WebServerThread
from PowerState import PowerState
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
VERSION='1.18' VERSION = '1.18'
class MyHTMLParser(HTMLParser):
    """
    HTML parser that collects the non-blank text fragments of a document.

    Each piece of character data reported by the parser is stripped of its
    surrounding whitespace; pieces that are empty once stripped are dropped,
    the others are appended to TokenList in the order they are met.
    """

    def __init__(self):
        # explicit base-class call, as in the original code
        HTMLParser.__init__(self)
        self.TokenList = []  # stripped text fragments, in order of appearance

    def handle_data(self, data):
        """
        Records the text ``data`` unless it only contains whitespace.
        """
        token = data.strip()
        # an empty string is falsy : no separate length check is needed
        if token:
            self.TokenList.append(token)

    def GetTokenList(self):
        """
        Returns the list of text fragments collected so far.
        """
        return self.TokenList
class WakeUpCompleteNotifier(IWakeUpCompleteNotifier):
    """
    Relays the end of a machine's wake up to the cluster controller.
    """

    def __init__(self, machineName, clusterController):
        # name of the machine this notifier has been created for
        self.m_machineName = machineName
        # controller whose onMachineWakeUpComplete() gets called
        self.m_clusterController = clusterController

    def onWakeUpComplete(self):
        """
        Tells the controller that the machine's wake up is complete.
        """
        logDebug('WakeUpCompleteNotifier::onWakeUpComplete : start')
        controller = self.m_clusterController
        controller.onMachineWakeUpComplete(self.m_machineName)
class SleepCompleteNotifier(ISleepCompleteNotifier):
    """
    Relays the outcome of a machine's sleep request to the cluster controller.
    """

    def __init__(self, machineName, clusterController):
        # name of the machine this notifier has been created for
        self.m_machineName = machineName
        # controller whose onMachineSleepComplete() gets called
        self.m_clusterController = clusterController

    def onSleepComplete(self, bSleepSucceeded):
        """
        Passes the machine name and ``bSleepSucceeded`` on to the controller.
        """
        logDebug('SleepCompleteNotifier::onSleepComplete : start')
        controller = self.m_clusterController
        controller.onMachineSleepComplete(self.m_machineName, bSleepSucceeded)
def jouleToKwh(fEnergyInJoules):
    """
    Converts an energy expressed in joules into kilowatt-hours.
    """
    # one kWh is 1000 W sustained during 3600 s
    fJoulesPerKwh = 1000.0 * 3600.0
    return fEnergyInJoules / fJoulesPerKwh
class ClusterController: class ClusterController:
""" """
The cluster controller monitors the cluster's activity and has multiple purposes : The cluster controller monitors the cluster's activity and has multiple purposes :
@ -67,10 +79,10 @@ class ClusterController:
jobs (eg add some machines to a queue). jobs (eg add some machines to a queue).
Mechanism to let user get priority Mechanism to let user get priority
""" """
def __init__( self ): def __init__(self):
gridEngine = SunGridEngine() gridEngine = SunGridEngine()
self.m_clusterStatus = ClusterStatus( gridEngine ) self.m_clusterStatus = ClusterStatus(gridEngine)
self.m_slotAllocator = DecoupledSlotAllocator() #SimpleSlotAllocator() self.m_slotAllocator = DecoupledSlotAllocator() # SimpleSlotAllocator()
self.m_machinesThatNeedWakeUp = {} self.m_machinesThatNeedWakeUp = {}
self.m_machinesThatNeedWakeupLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedWakeUp self.m_machinesThatNeedWakeupLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedWakeUp
self.m_machinesThatNeedSleeping = {} self.m_machinesThatNeedSleeping = {}
@ -82,47 +94,47 @@ class ClusterController:
self.m_bStop = False self.m_bStop = False
self.m_bStopLock = threading.Lock() # to prevent concurrent access to m_bStop self.m_bStopLock = threading.Lock() # to prevent concurrent access to m_bStop
def getClusterStatus(self):
    """
    Returns the cluster status object held by this controller.
    """
    clusterStatus = self.m_clusterStatus
    return clusterStatus
def log(self, message):
    """
    Prints ``message`` on the standard output.
    """
    print(message)
def shutdownLeastImportantNode(self):
    """
    Only logs that it has been called : no node is shut down here.
    """
    startMessage = "ClusterController::shutdownLeastImportantNode : start"
    self.log(startMessage)
def onMachineWakeUpComplete(self, machineName):
    """
    Removes ``machineName`` from the machines that are waiting for a wake up,
    then logs the removal.

    Raises KeyError if ``machineName`` is not in m_machinesThatNeedWakeUp.
    """
    # 'with' guarantees that the lock is released even if the deletion raises;
    # with a bare acquire()/release() pair, a KeyError here would leave the lock
    # held for ever and block every later user of the lock
    with self.m_machinesThatNeedWakeupLock:
        del self.m_machinesThatNeedWakeUp[machineName]
    logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)
def onMachineSleepComplete(self, machineName, bSleepSucceeded):
    """
    Removes ``machineName`` from the machines that are waiting to be put to
    sleep, then logs whether the sleep succeeded or not.

    Raises KeyError if ``machineName`` is not in m_machinesThatNeedSleeping.
    """
    # 'with' guarantees that the lock is released even if the deletion raises;
    # with a bare acquire()/release() pair, a KeyError here would leave the lock
    # held for ever and block every later user of the lock
    with self.m_machinesThatNeedSleepingLock:
        del self.m_machinesThatNeedSleeping[machineName]
    if bSleepSucceeded:
        logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
    else:
        logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)
def getNumPendingWakeUps(self):
    """
    Returns the number of machines currently in m_machinesThatNeedWakeUp.
    """
    # the count is read while holding the lock that protects the dictionary;
    # 'with' releases the lock on every exit path
    with self.m_machinesThatNeedWakeupLock:
        return len(self.m_machinesThatNeedWakeUp)
def getNumPendingSleeps(self):
    """
    Returns the number of machines currently in m_machinesThatNeedSleeping.
    """
    # the count is read while holding the lock that protects the dictionary;
    # 'with' releases the lock on every exit path
    with self.m_machinesThatNeedSleepingLock:
        return len(self.m_machinesThatNeedSleeping)
def putIdleMachinesToSleep( self ): def putIdleMachinesToSleep(self):
self.m_clusterStatus.m_lock.acquire() self.m_clusterStatus.m_lock.acquire()
idleMachines = self.m_clusterStatus.getIdleMachines() idleMachines = self.m_clusterStatus.getIdleMachines()
# logInfo('idleMachines :') # logInfo('idleMachines :')
@ -131,20 +143,19 @@ class ClusterController:
if idleMachine.getPowerState() == PowerState.ON: if idleMachine.getPowerState() == PowerState.ON:
# logInfo('\t%s' % machineName) # logInfo('\t%s' % machineName)
if idleMachine.getName() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things if idleMachine.getName() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things
self.m_machinesThatNeedSleeping[idleMachine.getName()]=idleMachine self.m_machinesThatNeedSleeping[idleMachine.getName()] = idleMachine
self.m_clusterStatus.m_lock.release() self.m_clusterStatus.m_lock.release()
listOfMachinesThatNeedSleeping = self.m_machinesThatNeedSleeping.values() # duplicate the list so that we don't iterate on m_machinesThatNeedSleeping, which could cause a runtime error because callbacks alter m_machinesThatNeedWakeUp listOfMachinesThatNeedSleeping = self.m_machinesThatNeedSleeping.values() # duplicate the list so that we don't iterate on m_machinesThatNeedSleeping, which could cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
for machine in listOfMachinesThatNeedSleeping: for machine in listOfMachinesThatNeedSleeping:
logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName()) logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName())
machine.requestSleep( SleepCompleteNotifier( machine.getName(), self ) ) machine.requestSleep(SleepCompleteNotifier(machine.getName(), self))
if len(listOfMachinesThatNeedSleeping) != 0: if len(listOfMachinesThatNeedSleeping) != 0:
# hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times # hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
while self.getNumPendingSleeps() > 0: while self.getNumPendingSleeps() > 0:
time.sleep(1) time.sleep(1)
def wakeUpMachinesForPendingJobs(self): def wakeUpMachinesForPendingJobs(self):
listOfMachinesThatNeedWakeUp = [] listOfMachinesThatNeedWakeUp = []
@ -156,15 +167,15 @@ class ClusterController:
logInfo('\t%d' % job.getId().asStr()) logInfo('\t%d' % job.getId().asStr())
""" """
if len(pendingJobs) != 0: if len(pendingJobs) != 0:
self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp( pendingJobs, self.m_clusterStatus ) self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp(pendingJobs, self.m_clusterStatus)
if len(self.m_machinesThatNeedWakeUp) == 0: if len(self.m_machinesThatNeedWakeUp) == 0:
None None
#logInfo('ClusterController::updateNormalState : no machine needs waking up' ) # logInfo('ClusterController::updateNormalState : no machine needs waking up')
else: else:
listOfMachinesThatNeedWakeUp = self.m_machinesThatNeedWakeUp.values() # duplicate the list so that we don't iterate on m_machinesThatNeedWakeUp, which would cause a runtime error because callbacks alter m_machinesThatNeedWakeUp listOfMachinesThatNeedWakeUp = self.m_machinesThatNeedWakeUp.values() # duplicate the list so that we don't iterate on m_machinesThatNeedWakeUp, which would cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
for machine in listOfMachinesThatNeedWakeUp: for machine in listOfMachinesThatNeedWakeUp:
logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for '+machine.getName() ) logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for ' + machine.getName())
machine.requestWakeUp( WakeUpCompleteNotifier( machine.getName(), self ) ) machine.requestWakeUp(WakeUpCompleteNotifier(machine.getName(), self))
self.m_clusterStatus.m_lock.release() self.m_clusterStatus.m_lock.release()
if len(listOfMachinesThatNeedWakeUp) != 0: if len(listOfMachinesThatNeedWakeUp) != 0:
@ -178,49 +189,49 @@ class ClusterController:
time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time.... time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time....
logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots') logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots')
def updateNormalState(self):
    """
    Runs one control step : first the idle machines are sent to sleep, then
    machines are woken up for the pending jobs.
    """
    # step 1 : try to put the idle machines to sleep
    self.putIdleMachinesToSleep()
    # step 2 : wake up the machines required by the pending jobs
    self.wakeUpMachinesForPendingJobs()
def storeSessionInDatabase(self):
    """
    Registers this cluster controller session in the 'clustercontroller' database.

    A row describing the session is inserted into `sessions_desc` and a row
    with zero energy savings is inserted into `session_to_energy_savings`.

    Returns the session id, which is the AUTO_INCREMENT value of `sessions_desc`
    read just before the session row is inserted.
    """
    # no 'assert conn' : the connection object is used right away, so the check added nothing
    conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
    try:
        # retrieve the session id, as it's an auto_increment field
        # NOTE(review): the id is read before the insert; if another session row
        # could be inserted in between, the two ids would differ - confirm that
        # only one writer exists
        sqlCommand = "SELECT AUTO_INCREMENT FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'clustercontroller' AND TABLE_NAME = 'sessions_desc'"
        print(sqlCommand)
        conn.query(sqlCommand)
        r = conn.store_result()
        iSessionId = r.fetch_row()[0][0]

        # stores information about the session
        sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.m_clusterStatus.m_clusterNodes))
        print(sqlCommand)
        conn.query(sqlCommand)

        # initialize the energy savings table
        sqlCommand = "INSERT INTO session_to_energy_savings (session_id, energy_savings_kwh) VALUES (%d,0.0);" % (iSessionId)
        print(sqlCommand)
        conn.query(sqlCommand)
    finally:
        # always give the connection back, even when one of the queries raises
        conn.close()
    print('Session Iid = %d' % iSessionId)
    return iSessionId
def updateSessionEnergyConsumptionInDatabase(self):
    """
    Writes the current state of this session into the 'clustercontroller' database :
    the energy savings (converted to kWh) in `session_to_energy_savings` and
    the end time (set to NOW()) in `sessions_desc`, both for session m_iSessionId.
    """
    # NOTE(review): connects as 'root' whereas storeSessionInDatabase connects
    # as 'clusterctrl' - confirm that this is intended
    # no 'assert conn' : the connection object is used right away, so the check added nothing
    conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller')
    try:
        # update energy savings for the current session
        sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.m_clusterStatus.getEnergySavings()), self.m_iSessionId)
        print(sqlCommand)
        conn.query(sqlCommand)

        # update the end time of the current session
        sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.m_iSessionId)
        print(sqlCommand)
        conn.query(sqlCommand)
    finally:
        # always give the connection back, even when one of the queries raises
        conn.close()
@ -231,7 +242,7 @@ class ClusterController:
""" """
self.m_clusterStatus.setControlOnMachine(machineName, bControl) self.m_clusterStatus.setControlOnMachine(machineName, bControl)
def run( self ): def run(self):
""" """
""" """
self.m_iSessionId = self.storeSessionInDatabase() self.m_iSessionId = self.storeSessionInDatabase()
@ -247,24 +258,24 @@ class ClusterController:
startTime = time.localtime() startTime = time.localtime()
while not self.m_bStop: while not self.m_bStop:
currentTime = time.time() currentTime = time.time()
#clusterStatus.m_nodesStatus['simpatix10'].dump() # clusterStatus.m_nodesStatus['simpatix10'].dump()
if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime +self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)): if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
iNumMachines = len(self.m_clusterStatus.m_clusterNodes) iNumMachines = len(self.m_clusterStatus.m_clusterNodes)
iNumMachinesOn = 0 iNumMachinesOn = 0
iNumSleepingMachines = 0 iNumSleepingMachines = 0
for machine in self.m_clusterStatus.m_clusterNodes.values(): for machine in self.m_clusterStatus.m_clusterNodes.values():
ePowerState = machine.getPowerState() ePowerState = machine.getPowerState()
if ePowerState == PowerState.ON: if ePowerState == PowerState.ON:
iNumMachinesOn+=1 iNumMachinesOn += 1
elif ePowerState == PowerState.SLEEP: elif ePowerState == PowerState.SLEEP:
iNumSleepingMachines+=1 iNumSleepingMachines += 1
logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines)) logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))
iNumSlots = self.m_clusterStatus.getNumControlledSlots() iNumSlots = self.m_clusterStatus.getNumControlledSlots()
iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots() iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots()
iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots() iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots()
iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots() iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots()
logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots )) logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots))
logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()) ) logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()))
logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings()))) logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings())))
self.updateSessionEnergyConsumptionInDatabase() self.updateSessionEnergyConsumptionInDatabase()
self.m_lastEnergyStatusLogTime = currentTime self.m_lastEnergyStatusLogTime = currentTime
@ -274,11 +285,11 @@ class ClusterController:
self.m_clusterStatus.stopReadingThreads() self.m_clusterStatus.stopReadingThreads()
def storeClusterNodeStatus( clusterNodeStatus ): def storeClusterNodeStatus(clusterNodeStatus):
#conn = MySQLdb.connect('simpatix10', 'measures_writer', '', 'simpa_measurements') # conn = MySQLdb.connect('simpatix10', 'measures_writer', '', 'simpa_measurements')
conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements') conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
assert(conn) assert conn
#conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""") # conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""")
''' '''
conn.query("""SELECT * FROM fan_rpm_logs""") conn.query("""SELECT * FROM fan_rpm_logs""")
r=conn.store_result() r=conn.store_result()
@ -287,28 +298,29 @@ def storeClusterNodeStatus( clusterNodeStatus ):
for key, sensor in clusterNodeStatus.m_sensors.items(): for key, sensor in clusterNodeStatus.m_sensors.items():
sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name
if sensor.typeName() == 'Fan': if sensor.typeName() == 'Fan':
sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('"""+sensorId+"""', """+str(sensor.m_rpms)+""", NOW());""" sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_rpms) + """, NOW());"""
print sqlCommand print(sqlCommand)
conn.query(sqlCommand) conn.query(sqlCommand)
elif sensor.typeName() == 'Temperature': elif sensor.typeName() == 'Temperature':
sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('"""+sensorId+"""', """+str(sensor.m_temperature)+""", NOW());""" sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_temperature) + """, NOW());"""
print sqlCommand print(sqlCommand)
conn.query(sqlCommand) conn.query(sqlCommand)
else: else:
assert(False) assert False
conn.close() conn.close()
if __name__ == '__main__':
    # script entry point : create the controller and run it; whatever stops it
    # is handed over to onException
    try:
        logInfo('ClusterController v. %s starting....' % VERSION)
        controller = ClusterController()
        controller.run()
    except BaseException as exception:  # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
        onException(exception)

View File

@ -1,48 +1,50 @@
import threading import threading
from PowerState import * from PowerState import PowerState, PowerStateToStr
from ClusterNodeStatusUpdater import * from ClusterNodeStatusUpdater import ClusterNodeStatusUpdater
import Lib.Util import Lib.Util
import Lib.SimpaDbUtil import Lib.SimpaDbUtil
from Log import logInfo, logWarning
from datetime import datetime
from datetime import *
class ClusterNode: class ClusterNode:
""" """
the state of a machine node the state of a machine node
""" """
def __init__(self, machineName, cluster, gridEngine):
    """
    Creates the node in the UNKNOWN power state, with zeroed energy counters,
    along with the status updater attached to it.
    """
    self.m_name = machineName
    # the cluster this machine belongs to
    self.m_cluster = cluster
    self.m_requestedPowerState = PowerState.ON
    self.m_powerState = PowerState.UNKNOWN
    # time at which the last value of self.m_powerState has been set
    self.m_lastPowerStateTime = None
    self.m_machineStatusUpdater = ClusterNodeStatusUpdater(machineName, self, gridEngine)
    # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
    self.m_energyConsumption = 0.0
    # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
    self.m_energySavings = 0.0
def getName(self):
    """
    Returns the name of this machine.
    """
    machineName = self.m_name
    return machineName
def isReady(self):
    """
    Tells whether the power state of this machine is known : returns False
    while it is PowerState.UNKNOWN and True for any other power state.
    """
    bPowerStateIsUnknown = (self.m_powerState == PowerState.UNKNOWN)
    return not bPowerStateIsUnknown
def getPowerState(self):
    """
    Returns the last power state recorded for this machine.
    """
    ePowerState = self.m_powerState
    return ePowerState
def setShouldAlwaysBeOn(self):
    """
    Forwards the 'should always be on' request to the status updater, then
    sets the power state of this machine to PowerState.ON.
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.setShouldAlwaysBeOn()
    self.setPowerState(PowerState.ON)
def setPowerState( self, powerState ): def setPowerState(self, powerState):
bUpdateRequiredChecks = False bUpdateRequiredChecks = False
if self.m_powerState == PowerState.UNKNOWN: if self.m_powerState == PowerState.UNKNOWN:
logInfo('ClusterNode::setPowerState : '+self.m_name+'\'s power state has been initialized to '+PowerStateToStr( powerState )) logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been initialized to ' + PowerStateToStr(powerState))
self.m_powerState = powerState self.m_powerState = powerState
self.m_lastPowerStateTime = datetime.now() self.m_lastPowerStateTime = datetime.now()
bUpdateRequiredChecks = True bUpdateRequiredChecks = True
@ -51,7 +53,7 @@ class ClusterNode:
self.updateEnergyMeasurements() self.updateEnergyMeasurements()
# then change the power state # then change the power state
if self.m_powerState != powerState: if self.m_powerState != powerState:
logInfo('ClusterNode::setPowerState : '+self.m_name+'\'s power state has been changed to '+PowerStateToStr( powerState )) logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been changed to ' + PowerStateToStr(powerState))
self.m_powerState = powerState self.m_powerState = powerState
self.m_lastPowerStateTime = datetime.now() self.m_lastPowerStateTime = datetime.now()
bUpdateRequiredChecks = True bUpdateRequiredChecks = True
@ -69,18 +71,18 @@ class ClusterNode:
self.m_machineStatusUpdater.m_bCheckPowerState = True self.m_machineStatusUpdater.m_bCheckPowerState = True
self.m_machineStatusUpdater.m_bCheckSensors = False self.m_machineStatusUpdater.m_bCheckSensors = False
else: else:
assert( False ) assert False
def onNewPowerStateReading( self, powerState ): def onNewPowerStateReading(self, powerState):
""" """
called when a new powerstate reading arrives called when a new powerstate reading arrives
""" """
if powerState != self.getPowerState(): if powerState != self.getPowerState():
if self.getPowerState() != PowerState.UNKNOWN: if self.getPowerState() != PowerState.UNKNOWN:
logWarning('ClusterNode::onNewPowerStateReading : '+self.m_name+'\'s power state has been (manually it seems) changed to '+PowerStateToStr( powerState )) logWarning('ClusterNode::onNewPowerStateReading : ' + self.m_name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(powerState))
self.setPowerState( powerState ) self.setPowerState(powerState)
def getPowerConsumptionForPowerState( self, ePowerState ): def getPowerConsumptionForPowerState(self, ePowerState):
""" """
returns the power consumption estimation (in watts) of this machine for the given power state returns the power consumption estimation (in watts) of this machine for the given power state
""" """
@ -96,45 +98,45 @@ class ClusterNode:
elif ePowerState == PowerState.UNPLUGGED: elif ePowerState == PowerState.UNPLUGGED:
fCurrentIntensity = 0.0 fCurrentIntensity = 0.0
else: else:
assert(False) assert False
return fCurrentIntensity * fCurrentVoltage return fCurrentIntensity * fCurrentVoltage
def updateEnergyMeasurements(self):
    """
    Adds to the energy counters of this machine (in joules) the energy
    consumed and the energy saved since the previous update, using the power
    estimate of the current power state for the whole interval.

    The savings are the difference between the power estimate for
    PowerState.ON and the one for the current power state.
    """
    # read the clock only once, so that the end of this interval is exactly
    # the start of the next one
    now = datetime.now()
    # total_seconds() and not .seconds : timedelta.seconds is only the seconds
    # component of the interval and ignores whole days, so an interval longer
    # than 24 hours would be under-counted
    fElapsedSeconds = (now - self.m_lastPowerStateTime).total_seconds()
    fCurrentPower = self.getPowerConsumptionForPowerState(self.m_powerState)
    fPowerWhenOn = self.getPowerConsumptionForPowerState(PowerState.ON)
    self.m_energyConsumption += fCurrentPower * fElapsedSeconds
    self.m_energySavings += (fPowerWhenOn - fCurrentPower) * fElapsedSeconds
    self.m_lastPowerStateTime = now
def getEnergyConsumption(self):
    """
    Refreshes the energy counters, then returns the energy consumption (in joules).
    """
    self.updateEnergyMeasurements()
    fEnergyConsumption = self.m_energyConsumption
    return fEnergyConsumption
def getPowerConsumption(self):
    """
    Returns the power consumption estimate of this machine for its current power state.
    """
    return self.getPowerConsumptionForPowerState(self.m_powerState)
def getEnergySavings(self):
    """
    Returns the energy saved by this machine so far.

    The energy measurements are refreshed before the value is read.
    """
    self.updateEnergyMeasurements()
    fEnergySavings = self.m_energySavings
    return fEnergySavings
def onSleepFailedBecauseAJobJustArrived(self):
    """
    Logs that the scheduled sleep of this machine was canceled because a new job is running on it.
    """
    strMessage = '%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name
    logInfo(strMessage)
def requestSleep(self, sleepCompleteNotifier=None):
    """
    Forwards a sleep request for this machine to its status updater.

    sleepCompleteNotifier : optional notifier passed on unchanged to the status updater
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestSleep(sleepCompleteNotifier)
def requestWakeUp(self, wakeUpCompleteNotifier=None):
    """
    Forwards a wake up request for this machine to its status updater.

    wakeUpCompleteNotifier : optional notifier passed on unchanged to the status updater
    """
    statusUpdater = self.m_machineStatusUpdater
    statusUpdater.requestWakeUp(wakeUpCompleteNotifier)
def getQueueMachineName(self):
    """
    Returns the name of the queue machine associated with this node.

    The name is looked up in the jobs state of the cluster, using this node's name.
    """
    # the statements that used to follow this return (an assert on m_queueName and a second
    # return) could never be executed and have been removed
    jobsState = self.getCluster().getJobsState()
    return jobsState.getQueueMachine(self.m_name).getName()
def getCluster(self):
    """
    Returns the cluster object stored in m_cluster.
    """
    cluster = self.m_cluster
    return cluster

View File

@ -2,143 +2,147 @@ import threading
import time import time
import Lib.Util import Lib.Util
import Lib.SimpaDbUtil import Lib.SimpaDbUtil
import os from PowerState import PowerState
import traceback from Log import logInfo, logDebug
import sys from Util import blockingWakeUpMachine, blockingPutMachineToSleep, getPowerState, onException
from PowerState import *
from QstatParser import *
import Util
class IWakeUpCompleteNotifier:
    """
    interface for wakeup notifiers
    """

    def onWakeUpComplete(self):
        """
        Called when the wake up has completed. Must be overridden by implementations.
        """
        # an explicit exception (instead of 'assert False') still fires when python runs with -O
        raise NotImplementedError('onWakeUpComplete is abstract')
class ISleepCompleteNotifier:
    """
    interface for sleep notifiers
    """

    def onSleepComplete(self, bSleepSucceeded):
        """
        Called when a sleep request has been handled. Must be overridden by implementations.

        bSleepSucceeded : True if the machine was put to sleep, False otherwise
        """
        # an explicit exception (instead of 'assert False') still fires when python runs with -O
        raise NotImplementedError('onSleepComplete is abstract')
class IRequest:
    """
    Base class of the requests that are queued for a cluster node status updater.
    """
    # identifiers of the request types
    GO_TO_SLEEP = 1
    WAKE_UP = 2
    CHECK_POWER_STATE = 3

    def __init__(self, requestType):
        """
        requestType : one of GO_TO_SLEEP, WAKE_UP, CHECK_POWER_STATE
        """
        self.m_type = requestType

    def getType(self):
        """
        Returns the request type given at construction.
        """
        return self.m_type

    def process(self, clusterNodeStatusUpdater):
        """
        processes this request
        """
        # this method is abstract ; an explicit exception (instead of 'assert False') still fires when python runs with -O
        raise NotImplementedError('process is abstract')
class WakeUpRequest(IRequest):
    """
    Request to wake up the machine handled by a cluster node status updater.
    """

    def __init__(self, wakeUpNotifier):
        """
        wakeUpNotifier : object whose onWakeUpComplete() is called once the request has been processed ; may be None
        """
        IRequest.__init__(self, IRequest.WAKE_UP)
        self.m_wakeUpNotifier = wakeUpNotifier

    def process(self, clusterNodeStatusUpdater):
        """
        Wakes up the machine, activates its queue, records the ON power state and sends the notification.
        """
        assert clusterNodeStatusUpdater.m_bShouldAlwaysBeOn is False  # are we attempting to wake up a machine that should always be on ?
        logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName())
        bSuccess = blockingWakeUpMachine(clusterNodeStatusUpdater.getName())
        assert bSuccess
        # activate the associated machine queue ; the call is kept outside the assert so that it is still made with python -O
        bQueueActivated = clusterNodeStatusUpdater.setQueueActivation(True)
        assert bQueueActivated
        # the with statement releases the lock even if setPowerState raises
        with clusterNodeStatusUpdater.m_stateLock:
            clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)
        if self.m_wakeUpNotifier:
            logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
            self.m_wakeUpNotifier.onWakeUpComplete()
class SleepRequest(IRequest):
    """
    Request to put the machine handled by a cluster node status updater to sleep.
    """

    def __init__(self, sleepCompleteNotifier):
        """
        sleepCompleteNotifier : object whose onSleepComplete(bSleepSucceeded) is called once the request has been processed ; may be None
        """
        IRequest.__init__(self, IRequest.GO_TO_SLEEP)
        self.m_sleepCompleteNotifier = sleepCompleteNotifier

    def process(self, clusterNodeStatusUpdater):
        """
        Deactivates the machine's queue and, if the queue is empty, puts the machine to sleep.

        If the queue is not empty, the queue is reactivated, the power state is set back
        to ON and the notifier is told that the sleep did not happen.
        """
        assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn  # are we attempting to put a machine the should stay on to sleep ?
        logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName())
        if clusterNodeStatusUpdater.setQueueActivation(False):
            if clusterNodeStatusUpdater.queueIsEmpty():
                if blockingPutMachineToSleep(clusterNodeStatusUpdater.m_clusterNodeName):
                    # now we know that the machine is asleep
                    # the with statement releases the lock even if setPowerState raises
                    with clusterNodeStatusUpdater.m_stateLock:
                        clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.SLEEP)
                    if self.m_sleepCompleteNotifier:
                        self.m_sleepCompleteNotifier.onSleepComplete(True)
                else:
                    assert False
            else:
                # reactivate the queue
                if not clusterNodeStatusUpdater.setQueueActivation(True):
                    assert False
                with clusterNodeStatusUpdater.m_stateLock:
                    clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)  # this is necessary to reenable the various cyclic checks that were disabled on sleep request
                clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived()
                if self.m_sleepCompleteNotifier:
                    self.m_sleepCompleteNotifier.onSleepComplete(False)
        else:
            assert False
class CheckPowerStateRequest(IRequest):
    """
    Request to read the power state of the machine handled by a cluster node status updater.
    """

    def __init__(self):
        IRequest.__init__(self, IRequest.CHECK_POWER_STATE)

    def process(self, clusterNodeStatusUpdater):
        """
        Reads the machine's power state, passes it to the cluster node and records the time of the check.
        """
        # the power state is read before taking the lock
        powerState = getPowerState(clusterNodeStatusUpdater.m_clusterNodeName)
        # the with statement releases the lock even if onNewPowerStateReading raises
        with clusterNodeStatusUpdater.m_stateLock:
            clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading(powerState)
            clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time()
class ClusterNodeStatusUpdater( threading.Thread ):
DELAY_BETWEEN_POWERSTATE_CHECKS=5*60 # in seconds
def __init__( self, machineName, clusterNode, gridEngine ): class ClusterNodeStatusUpdater(threading.Thread):
DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60 # in seconds
def __init__(self, machineName, clusterNode, gridEngine):
    """
    machineName : name of the machine handled by this updater
    clusterNode : the cluster node object that receives the power state changes
    gridEngine : the grid engine object returned by getGridEngine()
    """
    threading.Thread.__init__(self)
    # what this updater works on
    self.m_clusterNodeName = machineName
    self.m_clusterNode = clusterNode
    self.m_gridEngine = gridEngine
    # state of the updater
    self.m_stateLock = threading.Lock()  # lock that prevents concurrent access to the state of this instance
    self.m_pendingRequestsQueue = []
    self.m_lastPowerStateCheckTime = None  # time.time()
    self.m_bStop = False
    self.m_bCheckPowerState = True
    self.m_bShouldAlwaysBeOn = False  # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
def getGridEngine(self):
    """
    Returns the grid engine object given at construction.
    """
    gridEngine = self.m_gridEngine
    return gridEngine
def getName(self):
    """
    Returns the name of the machine handled by this updater.
    """
    strName = self.m_clusterNodeName
    return strName
def setShouldAlwaysBeOn(self):
    """
    Flags the machine as one that must never be put to sleep, and prints a message saying so.
    """
    strMachineName = self.getName()
    print('%s should always be on' % (strMachineName))
    self.m_bShouldAlwaysBeOn = True
def pushRequest(self, request):
    """
    Appends a request at the end of the queue of pending requests.

    request : the request to queue
    """
    # the with statement releases the lock even if the append raises
    with self.m_stateLock:
        self.m_pendingRequestsQueue.append(request)
def popRequest( self ): def popRequest(self):
oldestRequest = None oldestRequest = None
self.m_stateLock.acquire() self.m_stateLock.acquire()
if len(self.m_pendingRequestsQueue) != 0: if len(self.m_pendingRequestsQueue) != 0:
@ -146,14 +150,14 @@ class ClusterNodeStatusUpdater( threading.Thread ):
self.m_stateLock.release() self.m_stateLock.release()
return oldestRequest return oldestRequest
def run( self ): def run(self):
try: try:
while not self.m_bStop : while not self.m_bStop:
# handle the oldest request # handle the oldest request
request = self.popRequest() request = self.popRequest()
if request != None : if request is not None:
request.process( self ) request.process(self)
# schedule a power state check if required # schedule a power state check if required
currentTime = time.time() currentTime = time.time()
@ -161,28 +165,28 @@ class ClusterNodeStatusUpdater( threading.Thread ):
if not self.m_bShouldAlwaysBeOn: # don't do power checks on such machines because some current implementations of if not self.m_bShouldAlwaysBeOn: # don't do power checks on such machines because some current implementations of
# operations involved might cause the machine to go to sleep # operations involved might cause the machine to go to sleep
if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)): if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
self.pushRequest( CheckPowerStateRequest() ) self.pushRequest(CheckPowerStateRequest())
time.sleep(1) time.sleep(1)
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt) except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
Util.onException(exception) onException(exception)
def requestSleep(self, sleepCompleteNotifier=None):
    """
    Queues a request to put the machine to sleep.

    sleepCompleteNotifier : optional notifier handed to the sleep request
    """
    assert not self.m_bShouldAlwaysBeOn
    sleepRequest = SleepRequest(sleepCompleteNotifier)
    self.pushRequest(sleepRequest)
def requestWakeUp(self, wakeUpNotifier=None):
    """
    Queues a request to wake up the machine.

    wakeUpNotifier : optional notifier handed to the wake up request
    """
    assert self.m_bShouldAlwaysBeOn is False
    wakeUpRequest = WakeUpRequest(wakeUpNotifier)
    self.pushRequest(wakeUpRequest)
def getQueueMachineName(self):
    """
    Returns the queue machine name reported by the cluster node.
    """
    clusterNode = self.m_clusterNode
    return clusterNode.getQueueMachineName()
def setQueueActivation(self, bEnable):
    """
    Asks the grid engine to set the activation of this machine's queue instance.

    bEnable : the activation value passed to the grid engine
    @return true on success, false otherwise
    """
    strQueueMachineName = self.getQueueMachineName()
    gridEngine = self.getGridEngine()
    return gridEngine.setQueueInstanceActivation(strQueueMachineName, bEnable)
def queueIsEmpty(self):
    """
    Returns what the grid engine's queueIsEmpty reports for this machine's name.
    """
    gridEngine = self.getGridEngine()
    return gridEngine.queueIsEmpty(self.getName())

View File

@ -1,10 +1,13 @@
import threading import threading
from JobsStateUpdater import * from JobsStateUpdater import JobsStateUpdater
import Lib.Util import Lib.Util
import Lib.SimpaDbUtil import Lib.SimpaDbUtil
from ClusterNode import * from ClusterNode import ClusterNode
from Log import logInfo, logError
from PowerState import PowerState
import time import time
class ClusterStatus: class ClusterStatus:
""" """
The current state (jobs, sensors) of the cluster The current state (jobs, sensors) of the cluster
@ -15,39 +18,38 @@ class ClusterStatus:
self.m_gridEngine = gridEngine self.m_gridEngine = gridEngine
self.m_clusterNodes = {} self.m_clusterNodes = {}
self.m_lock = threading.Lock() # to prevent concurrent access to this instance self.m_lock = threading.Lock() # to prevent concurrent access to this instance
self.m_jobsStateUpdater = JobsStateUpdater( self ) self.m_jobsStateUpdater = JobsStateUpdater(self)
self.m_jobsState = None self.m_jobsState = None
#self.m_controlledMachineNames = [ 'simpatix30' ] # self.m_controlledMachineNames = ['simpatix30']
self.m_controlledMachineNames = [] # [ 'simpatix30' ] self.m_controlledMachineNames = [] # ['simpatix30']
if False: if False:
for iMachine in range(11, 40): for iMachine in range(11, 40):
if (iMachine == 31) or (iMachine == 32): if (iMachine == 31) or (iMachine == 32):
continue # these machines don't seem to be able to go to sleep properly (bug 00000010) continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
if (iMachine == 18): if (iMachine == 18):
continue # this machine needs maintenance (restarting because it's very slow for an unknown reason) continue # this machine needs maintenance (restarting because it's very slow for an unknown reason)
self.m_controlledMachineNames.append( 'simpatix%d' % iMachine ) self.m_controlledMachineNames.append('simpatix%d' % iMachine)
nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames() nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames()
for nodeName in nodeNames: for nodeName in nodeNames:
if nodeName in self.m_controlledMachineNames: if nodeName in self.m_controlledMachineNames:
logInfo( 'machine %s is under the cluster controller\'s control' % nodeName ) logInfo('machine %s is under the cluster controller\'s control' % nodeName)
clusterNode = ClusterNode( nodeName, self, gridEngine ) clusterNode = ClusterNode(nodeName, self, gridEngine)
if nodeName == 'simpatix10': if nodeName == 'simpatix10':
clusterNode.setShouldAlwaysBeOn() clusterNode.setShouldAlwaysBeOn()
self.m_clusterNodes[ nodeName ] = clusterNode self.m_clusterNodes[nodeName] = clusterNode
return return
def setControlOnMachine(self, machineName, bControl): def setControlOnMachine(self, machineName, bControl):
if bControl: if bControl:
# add machineName under control of ClusterController # add machineName under control of ClusterController
for k, v in self.m_clusterNodes.items(): for k, v in self.m_clusterNodes.items():
if v.getName() == machineName : if v.getName() == machineName:
return # nothing to do : machineName is already under the control of ClusterController return # nothing to do : machineName is already under the control of ClusterController
clusterNode = ClusterNode( machineName, self, self.m_gridEngine ) clusterNode = ClusterNode(machineName, self, self.m_gridEngine)
if machineName == 'simpatix10': if machineName == 'simpatix10':
clusterNode.setShouldAlwaysBeOn() clusterNode.setShouldAlwaysBeOn()
self.m_clusterNodes[ machineName ] = clusterNode self.m_clusterNodes[machineName] = clusterNode
clusterNode.m_machineStatusUpdater.start() clusterNode.m_machineStatusUpdater.start()
else: else:
# remove machineName from control of ClusterController # remove machineName from control of ClusterController
@ -57,48 +59,48 @@ class ClusterStatus:
clusterNode.m_machineStatusUpdater.join() clusterNode.m_machineStatusUpdater.join()
self.m_clusterNodes.pop(machineName) self.m_clusterNodes.pop(machineName)
def getGridEngine(self):
    """
    Returns the grid engine object stored in m_gridEngine.
    """
    gridEngine = self.m_gridEngine
    return gridEngine
def getMachines(self):
    """
    Returns the dictionary of cluster nodes, indexed by machine name.

    The dictionary itself is returned, not a copy.
    """
    clusterNodes = self.m_clusterNodes
    return clusterNodes
def startReadingThreads(self):
    """
    Starts the status updater thread of every cluster node, then the jobs state updater thread.
    """
    for clusterNode in self.m_clusterNodes.values():
        clusterNode.m_machineStatusUpdater.start()
    self.m_jobsStateUpdater.start()
def stopReadingThreads(self):
    """
    Asks every updater thread to stop and waits for it to finish.

    The node status updaters are stopped one after the other, then the jobs state updater.
    """
    for clusterNode in self.m_clusterNodes.values():
        statusUpdater = clusterNode.m_machineStatusUpdater
        statusUpdater.m_bStop = True
        statusUpdater.join()
    jobsStateUpdater = self.m_jobsStateUpdater
    jobsStateUpdater.m_bStop = True
    jobsStateUpdater.join()
def onNewJobsState(self, newJobsState):
    """
    Replaces the stored jobs state with the given one.

    newJobsState : the new jobs state
    """
    # m_lock protects m_jobsState ; the with statement always releases it
    with self.m_lock:
        self.m_jobsState = newJobsState
def getJobsOnMachine(self, machineName):
    """
    Returns what the current jobs state reports as the jobs on the given machine.

    machineName : name of the machine
    """
    jobsState = self.m_jobsState
    return jobsState.getJobsOnMachine(machineName)
def isReady(self):
    """
    Returns True when every cluster node is ready and a jobs state has been received.

    The reason is logged when the cluster status is not ready.
    """
    for clusterNode in self.m_clusterNodes.values():
        if clusterNode.isReady():
            continue
        logInfo('ClusterStatus::isReady : not ready because of ' + clusterNode.getName())
        return False
    if self.m_jobsState is None:
        logInfo('ClusterStatus::isReady : not ready because waiting for jobs state')
        return False
    return True
def getIdleMachines( self ): def getIdleMachines(self):
assert( self.isReady ) assert self.isReady
bBUG_00000009_IS_STILL_ALIVE = True bBUG_00000009_IS_STILL_ALIVE = True
if bBUG_00000009_IS_STILL_ALIVE: if bBUG_00000009_IS_STILL_ALIVE:
currentTime = time.time() currentTime = time.time()
@ -106,33 +108,33 @@ class ClusterStatus:
fJobsStateAge = currentTime - self.m_jobsState.getTime() fJobsStateAge = currentTime - self.m_jobsState.getTime()
if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE: if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
logError('ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge)) logError('ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge))
assert( False ) assert False
idleMachines = {} idleMachines = {}
for machineName, machine in self.m_clusterNodes.items(): for machineName, machine in self.m_clusterNodes.items():
if machine.getPowerState() == PowerState.ON: if machine.getPowerState() == PowerState.ON:
jobsOnThisMachine = self.getJobsOnMachine( machineName ) jobsOnThisMachine = self.getJobsOnMachine(machineName)
if len(jobsOnThisMachine) == 0: if len(jobsOnThisMachine) == 0:
idleMachines[ machineName ] = machine idleMachines[machineName] = machine
return idleMachines return idleMachines
def getPendingJobs(self):
    """
    Returns the pending jobs reported by the current jobs state.
    """
    jobsState = self.m_jobsState
    return jobsState.getPendingJobs()
def getJobsState(self):
    """
    Returns the stored jobs state (None until a first jobs state has been stored).
    """
    jobsState = self.m_jobsState
    return jobsState
def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements):
    """
    Tells whether the given queue machine satisfies the given job requirements.

    The only requirement checked is the list of allowed queues : when
    jobRequirements.m_queues is not empty, the queue of queueMachine has to be one of them.

    @return True if queueMachine fits, False otherwise
    """
    allowedQueues = jobRequirements.m_queues
    if not allowedQueues:
        return True  # no restriction on the queues
    if any(queueName == queueMachine.getQueueName() for queueName in allowedQueues):
        return True
    logInfo('queueMachineFitsJobRequirements : queueMachine ' + queueMachine.getName() + ' rejected because it\'s not in the allowed queues')
    return False
def getEnergyConsumption( self ): def getEnergyConsumption(self):
""" """
returns an estimate of the energy consumption since the start of the cluster controller (in joules) returns an estimate of the energy consumption since the start of the cluster controller (in joules)
""" """
@ -142,7 +144,7 @@ class ClusterStatus:
fEnergyConsumption += machine.getEnergyConsumption() fEnergyConsumption += machine.getEnergyConsumption()
return fEnergyConsumption return fEnergyConsumption
def getEnergySavings( self ): def getEnergySavings(self):
""" """
returns an estimate of the energy saving since the start of the cluster controller (in joules) returns an estimate of the energy saving since the start of the cluster controller (in joules)
""" """
@ -152,58 +154,56 @@ class ClusterStatus:
fEnergySavings += machine.getEnergySavings() fEnergySavings += machine.getEnergySavings()
return fEnergySavings return fEnergySavings
def getCurrentPowerConsumption(self):
    """
    Returns the sum of the power consumptions of the cluster nodes that are ready.
    """
    readyMachines = [machine for machine in self.m_clusterNodes.values() if machine.isReady()]
    fPowerConsumption = 0.0
    for machine in readyMachines:
        fPowerConsumption += machine.getPowerConsumption()
    return fPowerConsumption
def getCurrentPowerSavings(self):
    """
    Returns the sum, over the cluster nodes that are ready, of the difference between
    the power consumption in the ON state and the current power consumption.
    """
    fPowerSavings = 0.0
    for machine in self.m_clusterNodes.values():
        if not machine.isReady():
            continue
        fPowerWhenOn = machine.getPowerConsumptionForPowerState(PowerState.ON)
        fPowerSavings += fPowerWhenOn - machine.getPowerConsumption()
    return fPowerSavings
def getNumControlledSlots(self):
    """
    Returns the total number of slots of the queue machines of all cluster nodes.
    """
    iNumControlledSlots = 0
    # the with statement releases m_lock even if a lookup in the jobs state raises
    # (previously such an exception left the lock held forever)
    with self.m_lock:
        for machine in self.m_clusterNodes.values():
            queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
            iNumControlledSlots += queueMachine.getNumSlots()
    return iNumControlledSlots
def getNumUsedSlots(self):
    """
    Returns the number of slots in use on the queue machines of all cluster nodes.

    For each machine, the used slots are its number of slots minus its number of free slots.
    """
    iNumUsedSlots = 0
    # the with statement releases m_lock even if a lookup in the jobs state raises
    # (previously such an exception left the lock held forever)
    with self.m_lock:
        for machine in self.m_clusterNodes.values():
            queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
            iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
            assert iNumUsedSlotsOnThisMachine >= 0
            iNumUsedSlots += iNumUsedSlotsOnThisMachine
    return iNumUsedSlots
def getNumWastedSlots(self):
    """
    Returns the number of free slots on the machines whose power state is ON.
    """
    iNumWastedSlots = 0
    # the with statement releases m_lock even if one of the calls below raises
    # (previously such an exception left the lock held forever)
    with self.m_lock:
        for machine in self.m_clusterNodes.values():
            if machine.getPowerState() == PowerState.ON:
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
    return iNumWastedSlots
def getNumSleepingSlots(self):
    """
    Returns the number of free slots on the machines whose power state is SLEEP.
    """
    iNumSleepingSlots = 0
    # the with statement releases m_lock even if one of the calls below raises
    # (previously such an exception left the lock held forever)
    with self.m_lock:
        for machine in self.m_clusterNodes.values():
            if machine.getPowerState() == PowerState.SLEEP:
                queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
                iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
    return iNumSleepingSlots

View File

@ -1,19 +1,21 @@
class JobStateFlags:
    """
    Bit flags describing the state of a job ; each flag is a distinct power of two.
    """
    RUNNING = 1 << 0  # the job is running
    WAITING = 1 << 1  # the job is waiting
    QUEUED = 1 << 2  # not sure what that exactly means but it reflects the q state of jobs as seen in the pending jobs list from qstat -f -u \*
    TRANSFERING = 1 << 3
    DELETED = 1 << 4
    HOLD = 1 << 5
    ERROR = 1 << 6
    SUSPENDED = 1 << 7
class ParallelEnvironment:
    """
    Identifiers of the parallel environments.
    """
    MPI = 1
class JobRequirements: class JobRequirements:
def __init__( self ): def __init__(self):
self.m_numSlots = None self.m_numSlots = None
self.m_strArchitecture = None # machine architecture self.m_strArchitecture = None # machine architecture
self.m_parallelEnvironment = None self.m_parallelEnvironment = None
@ -28,13 +30,14 @@ class JobId:
share the same sge job identifier. To uniquely define a job array element, we also use the task id. share the same sge job identifier. To uniquely define a job array element, we also use the task id.
""" """
MAX_NUM_JOBS_IN_ARRAY = 1000000 MAX_NUM_JOBS_IN_ARRAY = 1000000
def __init__(self, iJobId, iJobArrayElementId=None):
    """
    iJobId : the job identifier
    iJobArrayElementId : the task id of the job array element, or None if this identifier does not refer to a job array element
    """
    bIsJobArrayElement = iJobArrayElementId is not None
    if bIsJobArrayElement:
        assert iJobArrayElementId <= self.MAX_NUM_JOBS_IN_ARRAY
    self.m_iJobId = iJobId
    self.m_iJobArrayElementId = iJobArrayElementId  # None if this identifier does not refer to a job array element
def __hash__( self ): def __hash__(self):
""" """
required to use a JobId as a dict hash key required to use a JobId as a dict hash key
""" """
@ -43,7 +46,7 @@ class JobId:
hash += self.m_iJobArrayElementId hash += self.m_iJobArrayElementId
return hash return hash
def __eq__( self, other ): def __eq__(self, other):
""" """
required to use a JobId as a dict hash key required to use a JobId as a dict hash key
""" """
@ -53,22 +56,21 @@ class JobId:
return False return False
return True return True
def isJobArrayElement(self):
    """
    Returns True if this identifier refers to an element of a job array.
    """
    if self.m_iJobArrayElementId is None:
        return False
    return True
def getMainId(self):
    """
    Returns the job identifier, without the job array element id.
    """
    iMainId = self.m_iJobId
    return iMainId
def asStr(self):
    """
    Returns this identifier as a string : the job id, followed by '.' and the
    job array element id when this identifier refers to a job array element.
    """
    strMainId = '%s' % self.m_iJobId
    if not self.isJobArrayElement():
        return strMainId
    return strMainId + '.%d' % self.m_iJobArrayElementId
class Job: class Job:
def __init__( self, jobId ): def __init__(self, jobId):
self.m_jobId = jobId self.m_jobId = jobId
self.m_startTime = None self.m_startTime = None
self.m_submitTime = None self.m_submitTime = None
@ -78,53 +80,67 @@ class Job:
self.m_stateFlags = 0 self.m_stateFlags = 0
self.m_jobRequirements = JobRequirements() self.m_jobRequirements = JobRequirements()
self.m_requestedRamPerCore = 0 self.m_requestedRamPerCore = 0
def getId(self):
    """
    Returns the identifier of this job, as given at construction.
    """
    jobId = self.m_jobId
    return jobId
def setState(self, state):
    """
    Replaces the state flags of this job.

    state : the new state flags
    """
    self.m_stateFlags = state
def setOwner(self, jobOwner):
    """
    Stores the owner of this job.

    If an owner is already set, the new one is expected to be the same.
    """
    currentOwner = self.m_owner
    assert (not currentOwner) or (currentOwner == jobOwner)
    self.m_owner = jobOwner
def getOwner(self):
    """
    Returns the owner of this job.
    """
    owner = self.m_owner
    return owner
def setStartTime(self, jobStartTime):
    """
    Stores the start time of this job.

    If a start time is already set, the new one is expected to be the same.
    """
    currentStartTime = self.m_startTime
    assert (not currentStartTime) or (currentStartTime == jobStartTime)
    self.m_startTime = jobStartTime
def setSubmitTime(self, jobSubmitTime):
    """
    Stores the submit time of this job.

    If a submit time is already set, the new one is expected to be the same.
    """
    currentSubmitTime = self.m_submitTime
    assert (not currentSubmitTime) or (currentSubmitTime == jobSubmitTime)
    self.m_submitTime = jobSubmitTime
def getStartTime(self):
    """
    returns the start time recorded by setStartTime
    """
    return self.m_startTime
def setScriptName(self, jobScriptName):
    """
    records the name of the script of this job

    if a script name has already been recorded, the new value is expected to be the same (checked with an assertion)
    """
    if self.m_scriptName:
        assert self.m_scriptName == jobScriptName, 'the script name of a job is not expected to change'
    self.m_scriptName = jobScriptName
def addSlots(self, queueMachineName, numSlots):
    """
    records that this job uses numSlots slots on the queue machine named queueMachineName

    a given queue machine is expected to be declared only once per job (checked with an assertion);
    should this happen anyway while assertions are disabled, the slots are accumulated
    """
    numAlreadyUsedSlots = self.m_slots.get(queueMachineName)
    assert numAlreadyUsedSlots is None, 'slots have already been declared for this queue machine'
    if numAlreadyUsedSlots is None:
        self.m_slots[queueMachineName] = numSlots
    else:
        # should never happen
        self.m_slots[queueMachineName] = numAlreadyUsedSlots + numSlots
def getSlots(self):
    """
    returns the dictionary filled by addSlots : the number of slots used by this job, indexed by queue machine name
    """
    return self.m_slots
def setNumRequiredSlots(self, numSlots):
    """
    numSlots : the number of slots this job requires (stored in the job requirements)
    """
    self.m_jobRequirements.m_numSlots = numSlots
def isPending(self):
    """
    returns true if this job is waiting in the queue for whatever reason

    the returned value is not a bool but the result of masking the state flags with JobStateFlags.QUEUED
    (non-zero if the flag is set, 0 otherwise), so it is meant to be used as a truth value
    """
    return self.m_stateFlags & JobStateFlags.QUEUED
def getRequestedRamPerCore(self):
    """
    requested RAM per core in bytes
    """
    return self.m_requestedRamPerCore
def setRequestedRamPerCore(self, requestedRam):
    """
    requestedRam : requested RAM per core in bytes
    """
    self.m_requestedRamPerCore = requestedRam

View File

@ -1,85 +1,86 @@
from .Log import * from .Log import *
class JobsState:
    r"""
    represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*"

    jobs are only accessed through getId, getSlots and isPending; queue machines through getName,
    getNumSlots and their m_machineName attribute
    """

    def __init__(self):
        self.m_jobs = {}  # the jobs of this snapshot, indexed by job id
        self.m_jobArrayJobs = {}  # a dictionary of jobs for each job array, indexed by job array id
        self.m_queueMachines = {}  # queue machines such as allintel.q@simpatix10, indexed by their name
        self.m_stateTime = None  # the time at which the state was snapshot

    def deleteAllJobs(self):
        """
        forgets all the jobs (the queue machines and the snapshot time are left untouched)
        """
        self.m_jobs = {}
        self.m_jobArrayJobs = {}

    def addJob(self, job):
        """
        registers job under its id; if this id is a job array element, the job is also registered
        in the dictionary of the jobs of its job array
        """
        jobId = job.getId()
        self.m_jobs[jobId] = job
        if jobId.isJobArrayElement():
            self.m_jobArrayJobs.setdefault(jobId.m_iJobId, {})[jobId] = job

    def getJob(self, jobId):
        """
        returns the job registered under jobId, or None if there is no such job
        """
        return self.m_jobs.get(jobId)

    def getJobArrayJobs(self, iJobArrayId):
        """
        returns the dictionary (indexed by job id) of the registered jobs that are elements of the
        job array iJobArrayId, or None if no element of this job array has been registered
        """
        return self.m_jobArrayJobs.get(iJobArrayId)

    def setTime(self, stateTime):
        """
        stateTime : the time at which the state was snapshot
        """
        self.m_stateTime = stateTime

    def getTime(self):
        """
        returns the time set by setTime (None if it has not been set)
        """
        return self.m_stateTime

    def getJobsOnMachine(self, machineName):
        """
        returns the dictionary (indexed by job id) of the jobs that use at least one slot of a queue
        machine located on the machine machineName
        """
        jobsOnMachine = {}
        for jobId, job in self.m_jobs.items():
            for queueMachineName in job.getSlots():
                # queue machine names are of the form <queue name>@<machine name>
                if queueMachineName.split('@')[1] == machineName:
                    jobsOnMachine[jobId] = job
        return jobsOnMachine

    def getNumFreeSlotsOnQueueMachine(self, queueMachine):
        """
        returns the number of slots of queueMachine that are not used by any registered job

        the result is expected to be non negative (checked with an assertion)
        """
        queueMachineName = queueMachine.getName()
        numUsedSlots = 0
        for job in self.m_jobs.values():
            numUsedSlotsByThisJob = job.getSlots().get(queueMachineName)
            if numUsedSlotsByThisJob is not None:
                numUsedSlots += numUsedSlotsByThisJob
        numFreeSlots = queueMachine.getNumSlots() - numUsedSlots
        assert numFreeSlots >= 0
        return numFreeSlots

    def addQueueMachine(self, queueMachine):
        """
        registers queueMachine under its name (as returned by its getName method)
        """
        self.m_queueMachines[queueMachine.getName()] = queueMachine

    def getQueueMachine(self, machineName):
        """
        finds the queue machine associated with a machine

        returns None if no registered queue machine is located on machineName
        """
        queueMachine = None
        for qm in self.m_queueMachines.values():
            if qm.m_machineName == machineName:
                assert queueMachine is None  # to be sure that no more than one queue machine is on a given machine
                queueMachine = qm
        return queueMachine

    def getQueueMachines(self):
        """
        returns the dictionary of the registered queue machines, indexed by their name
        """
        return self.m_queueMachines

    def getPendingJobs(self):
        """
        returns the dictionary (indexed by job id) of the registered jobs for which isPending is true
        """
        return {job.getId(): job for job in self.m_jobs.values() if job.isPending()}

View File

@ -1,29 +1,33 @@
import time import time
import threading import threading
gLogFilePath = '/tmp/ClusterController.log'#'/var/log/ClusterController.log' gLogFilePath = '/tmp/ClusterController.log' # '/var/log/ClusterController.log'
def log(message):
    """
    prints message, prefixed with the current local time and the name of the calling thread,
    and appends the same line to the file gLogFilePath

    message : the text to log (a string, as it is concatenated to the prefix)
    """
    threadName = threading.current_thread().name
    logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message
    print(logMessage)
    # the context manager closes the file even if the write fails
    with open(gLogFilePath, 'a+') as f:
        f.write(logMessage + '\n')
def logDebug(message):
    """
    logs message with the debug prefix '[D]'
    """
    log('[D]' + message)


def logInfo(message):
    """
    logs message with the information prefix '[I]'
    """
    log('[I]' + message)


def logWarning(message):
    """
    logs message with the warning prefix '[W]'
    """
    log('[W]' + message)


def logError(message):
    """
    logs message with the error prefix '[E]'
    """
    log('[E]' + message)

View File

@ -1,12 +1,13 @@
class PowerState:
    """
    integer constants identifying a power state (PowerStateToStr gives their name)
    """
    UNKNOWN = 0
    OFF = 1
    ON = 2
    SLEEP = 3
    UNPLUGGED = 4
def PowerStateToStr( powerState ):
def PowerStateToStr(powerState):
if powerState == PowerState.UNKNOWN: if powerState == PowerState.UNKNOWN:
return 'UNKNOWN' return 'UNKNOWN'
if powerState == PowerState.OFF: if powerState == PowerState.OFF:
@ -18,4 +19,4 @@ def PowerStateToStr( powerState ):
if powerState == PowerState.UNPLUGGED: if powerState == PowerState.UNPLUGGED:
return 'UNPLUGGED' return 'UNPLUGGED'
else: else:
assert( False ) assert False

View File

@ -1,15 +1,17 @@
import io import io
import re import re
from .JobsState import * from .JobsState import JobsState
from .QueueMachine import * from .QueueMachine import QueueMachine, QueueMachineStateFlags
from .Util import * from .Util import *
from .Log import * from .Log import logError
from .Job import * from .Job import JobStateFlags, JobId, Job, ParallelEnvironment
import logging
class QstatParser: class QstatParser:
def parseJobState( self, strJobStatus ): def parseJobState(self, strJobStatus):
jobState = 0 jobState = 0
for i in range(0, len(strJobStatus) ): for i in range(0, len(strJobStatus)):
c = strJobStatus[i] c = strJobStatus[i]
if c == 'r': if c == 'r':
jobState += JobStateFlags.RUNNING jobState += JobStateFlags.RUNNING
@ -30,9 +32,10 @@ class QstatParser:
else: else:
assert False, 'unhandled job state flag :"' + c + '"' assert False, 'unhandled job state flag :"' + c + '"'
return jobState return jobState
def parseQueueMachineState( self, strQueueMachineStatus ):
def parseQueueMachineState(self, strQueueMachineStatus):
queueMachineState = 0 queueMachineState = 0
for i in range(0, len(strQueueMachineStatus) ): for i in range(0, len(strQueueMachineStatus)):
c = strQueueMachineStatus[i] c = strQueueMachineStatus[i]
if c == 'd': if c == 'd':
queueMachineState += QueueMachineStateFlags.DISABLED queueMachineState += QueueMachineStateFlags.DISABLED
@ -49,7 +52,8 @@ class QstatParser:
else: else:
assert False, 'unhandled queue machine state flag :"' + c + '"' assert False, 'unhandled queue machine state flag :"' + c + '"'
return queueMachineState return queueMachineState
def parseQstatOutput( self, qstatOutput ):
def parseQstatOutput(self, qstatOutput):
""" """
parses result of command 'qstat -f -u \* -pri' parses result of command 'qstat -f -u \* -pri'
""" """
@ -69,30 +73,29 @@ class QstatParser:
singleIndexMatch = re.match('^(?P<elementIndex>[0-9]+)$', strRange) singleIndexMatch = re.match('^(?P<elementIndex>[0-9]+)$', strRange)
if singleIndexMatch: if singleIndexMatch:
iElementIndex = int(singleIndexMatch.group('elementIndex')) iElementIndex = int(singleIndexMatch.group('elementIndex'))
task_ids.extend(range(iElementIndex, iElementIndex+1)) task_ids.extend(range(iElementIndex, iElementIndex + 1))
else: else:
# we expect strRange to be of the form "1-4:1", where : # we expect strRange to be of the form "1-4:1", where :
# the 1st number is the min element index (sge imposes it to be greater than 0) # the 1st number is the min element index (sge imposes it to be greater than 0)
# the 2nd number is the max element index # the 2nd number is the max element index
# the 3rd number is the step between consecutive element indices # the 3rd number is the step between consecutive element indices
rangeMatch = re.match( '^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange) rangeMatch = re.match('^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange)
if rangeMatch == None: if rangeMatch is None:
logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line) ) logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line))
assert(False) assert False
iMinElementIndex=int(rangeMatch.group('minElementIndex')) iMinElementIndex = int(rangeMatch.group('minElementIndex'))
iMaxElementIndex=int(rangeMatch.group('maxElementIndex')) iMaxElementIndex = int(rangeMatch.group('maxElementIndex'))
iStepBetweenIndices=int(rangeMatch.group('stepBetweenIndices')) iStepBetweenIndices = int(rangeMatch.group('stepBetweenIndices'))
task_ids.extend(range(iMinElementIndex, iMaxElementIndex+1, iStepBetweenIndices)) task_ids.extend(range(iMinElementIndex, iMaxElementIndex + 1, iStepBetweenIndices))
return task_ids return task_ids
# ugly hack to work around the fact that qstat truncates the fqdn of cluster nodes # ugly hack to work around the fact that qstat truncates the fqdn of cluster nodes
# graffy@physix-master:~$ qstat -f -u \* # graffy@physix-master:~$ qstat -f -u \*
# queuename qtype resv/used/tot. load_avg arch states # queuename qtype resv/used/tot. load_avg arch states
# --------------------------------------------------------------------------------- # ---------------------------------------------------------------------------------
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64 # main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*' # TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
qstatOutput = re.sub('\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput) qstatOutput = re.sub(r'\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput)
jobsState = JobsState() jobsState = JobsState()
f = io.StringIO(qstatOutput) f = io.StringIO(qstatOutput)
@ -113,95 +116,98 @@ class QstatParser:
# ntckts The job's ticket amount in normalized fashion. # ntckts The job's ticket amount in normalized fashion.
# ppri The job's -p priority as specified by the user. # ppri The job's -p priority as specified by the user.
jobRegularExp = re.compile( '^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$' ) jobRegularExp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$')
# example of machine line : # example of machine line :
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86 # allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
machineRegularExp = re.compile( '^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)' ) machineRegularExp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)')
pendingJobsHeaderRegularExp = re.compile( '^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*' ) pendingJobsHeaderRegularExp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*')
while( len(line) > 0 ): while len(line) > 0:
# print line # print line
# check if the current line is a line describing a job running on a machine # check if the current line is a line describing a job running on a machine
matchObj = jobRegularExp.match( line ) matchObj = jobRegularExp.match(line)
if matchObj: if matchObj:
# we are dealing with a job line # we are dealing with a job line
if not bInPendingJobsSection: if not bInPendingJobsSection:
assert( currentQueueMachine ) assert currentQueueMachine
#log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"') # log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
iJobId = int(matchObj.group('jobId')) iJobId = int(matchObj.group('jobId'))
jobState = self.parseJobState( matchObj.group('jobStatus') ) logging.debug('iJobId = %d' % iJobId)
jobState = self.parseJobState(matchObj.group('jobStatus'))
strJobArrayDetails = matchObj.group('jobArrayDetails') strJobArrayDetails = matchObj.group('jobArrayDetails')
bIsJobArray = (len(strJobArrayDetails) != 0) bIsJobArray = (len(strJobArrayDetails) != 0)
#logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray))) # logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
# each element of a job array is treated as a separate job for the sake of simplicity. # each element of a job array is treated as a separate job for the sake of simplicity.
# For these elements, the job id in sge sense is the same, but they are different in this program's sense # For these elements, the job id in sge sense is the same, but they are different in this program's sense
task_ids = range(0,1) # just one element, unless it's a job array task_ids = range(0, 1) # just one element, unless it's a job array
if bIsJobArray: if bIsJobArray:
if bInPendingJobsSection: if bInPendingJobsSection:
task_ids = parse_pending_tasks(strJobArrayDetails) task_ids = parse_pending_tasks(strJobArrayDetails)
else: else:
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element # we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
iJobArrayElementIndex = int(strJobArrayDetails) iJobArrayElementIndex = int(strJobArrayDetails)
assert(iJobArrayElementIndex != 0) # sge does not allow element indices to be 0 assert iJobArrayElementIndex != 0 # sge does not allow element indices to be 0
task_ids = range(iJobArrayElementIndex,iJobArrayElementIndex+1) task_ids = range(iJobArrayElementIndex, iJobArrayElementIndex + 1)
logging.debug('task_ids = %s' % task_ids)
for task_id in task_ids: for task_id in task_ids:
logging.debug('task_id = %s' % task_id)
jobId = None jobId = None
if bIsJobArray: if bIsJobArray:
jobId = JobId(iJobId, task_id) jobId = JobId(iJobId, task_id)
else: else:
jobId = JobId(iJobId) jobId = JobId(iJobId)
job = jobsState.getJob(jobId) job = jobsState.getJob(jobId)
#logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr())) # logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr()))
if job == None: if job is None:
# this job hasn't been encountered yet in the output of qstat ... # this job hasn't been encountered yet in the output of qstat ...
# we could either be in the pending jobs section or in the running jobs section # we could either be in the pending jobs section or in the running jobs section
job = Job(jobId) job = Job(jobId)
jobsState.addJob( job ) jobsState.addJob(job)
job.setState( jobState ) job.setState(jobState)
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime') strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime')
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S') jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S')
if bInPendingJobsSection: if bInPendingJobsSection:
job.setSubmitTime( jobStartOrSubmitTime ) job.setSubmitTime(jobStartOrSubmitTime)
else: else:
job.setStartTime( jobStartOrSubmitTime ) job.setStartTime(jobStartOrSubmitTime)
job.setOwner( matchObj.group('jobOwner') ) job.setOwner(matchObj.group('jobOwner'))
job.setScriptName( matchObj.group('jobScriptName') ) job.setScriptName(matchObj.group('jobScriptName'))
if bInPendingJobsSection: if bInPendingJobsSection:
job.setNumRequiredSlots(int(matchObj.group('numSlots'))) job.setNumRequiredSlots(int(matchObj.group('numSlots')))
else: else:
assert( not bInPendingJobsSection ) # if we are in the pending jobs section, the job should be new assert not bInPendingJobsSection # if we are in the pending jobs section, the job should be new
if not bInPendingJobsSection: if not bInPendingJobsSection:
job.addSlots( currentQueueMachine.getName(), int(matchObj.group('numSlots')) ) job.addSlots(currentQueueMachine.getName(), int(matchObj.group('numSlots')))
else: else:
# the current line does not describe a job # the current line does not describe a job
if not bInPendingJobsSection: if not bInPendingJobsSection:
# check if this line describes the status of a machine # check if this line describes the status of a machine
matchObj = machineRegularExp.match( line ) matchObj = machineRegularExp.match(line)
if matchObj: if matchObj:
queueName = matchObj.group('queueName') queueName = matchObj.group('queueName')
machineName = matchObj.group('machineName') machineName = matchObj.group('machineName')
queueMachine = QueueMachine( queueName, machineName ) queueMachine = QueueMachine(queueName, machineName)
#log(line) # log(line)
#log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString')) # log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
#log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots')) # log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
queueMachine.setNumSlots( int( matchObj.group('numTotalSlots') ) ) queueMachine.setNumSlots(int(matchObj.group('numTotalSlots')))
queueMachine.setNumUsedSlots( int( matchObj.group('numUsedSlots') ) ) queueMachine.setNumUsedSlots(int(matchObj.group('numUsedSlots')))
strCpuLoad = matchObj.group('cpuLoad') strCpuLoad = matchObj.group('cpuLoad')
if strCpuLoad != '-NA-': if strCpuLoad != '-NA-':
queueMachine.setCpuLoad( float(strCpuLoad) ) queueMachine.setCpuLoad(float(strCpuLoad))
strQueueMachineState = matchObj.group('queueMachineStatus') strQueueMachineState = matchObj.group('queueMachineStatus')
queueMachine.setState( self.parseQueueMachineState( strQueueMachineState ) ) queueMachine.setState(self.parseQueueMachineState(strQueueMachineState))
#log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"') # log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
#log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"') # log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
currentQueueMachine = queueMachine currentQueueMachine = queueMachine
jobsState.addQueueMachine( queueMachine ) jobsState.addQueueMachine(queueMachine)
else: else:
matchObj = pendingJobsHeaderRegularExp.match( line ) matchObj = pendingJobsHeaderRegularExp.match(line)
if matchObj: if matchObj:
bInPendingJobsSection = True bInPendingJobsSection = True
currentQueueMachine = None currentQueueMachine = None
else: else:
#print line # print line
None None
else: else:
# we are in a pending jobs section # we are in a pending jobs section
@ -209,30 +215,31 @@ class QstatParser:
if not matchObj: if not matchObj:
# unexpected line # unexpected line
print('line = "' + line + '"') print('line = "' + line + '"')
assert( False ) assert False
None None
line = f.readline() line = f.readline()
f.close() f.close()
return jobsState return jobsState
def parseJobDetails( self, qstatOutput, job ):
def parseJobDetails(self, qstatOutput, job):
""" """
adds to job the details parsed from the output of the "qstat -j <jobid>" command adds to job the details parsed from the output of the "qstat -j <jobid>" command
""" """
f = io.StringIO(qstatOutput) f = io.StringIO(qstatOutput)
line = f.readline() line = f.readline()
fieldRegularExp = re.compile( '^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$' ) fieldRegularExp = re.compile('^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$')
while( len(line) > 0 ): while len(line) > 0:
# print line # print line
# check if the current line is a line describing a job running on a machine # check if the current line is a line describing a job running on a machine
matchObj = fieldRegularExp.match( line ) matchObj = fieldRegularExp.match(line)
if matchObj: if matchObj:
fieldName = matchObj.group('fieldName') fieldName = matchObj.group('fieldName')
strFieldValue = matchObj.group('fieldValue') strFieldValue = matchObj.group('fieldValue')
if fieldName == 'job_number': if fieldName == 'job_number':
assert( job.getId().asStr() == strFieldValue ) assert job.getId().asStr() == strFieldValue
elif fieldName == 'hard_queue_list': elif fieldName == 'hard_queue_list':
allowedQueues = strFieldValue.split(',') allowedQueues = strFieldValue.split(',')
assert(len(allowedQueues) > 0) assert len(allowedQueues) > 0
job.m_jobRequirements.m_queues = allowedQueues job.m_jobRequirements.m_queues = allowedQueues
elif fieldName == 'parallel environment': elif fieldName == 'parallel environment':
# the value could be 'ompi range: 32' # the value could be 'ompi range: 32'
@ -240,10 +247,9 @@ class QstatParser:
if matchObj: if matchObj:
job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI
else: else:
assert( False ) assert False
else: else:
# ignore he other fields # ignore he other fields
None None
line = f.readline() line = f.readline()
f.close() f.close()

View File

@ -1,17 +1,18 @@
class QueueMachineStateFlags:
    """
    flags describing the state of a queue machine; each flag is a distinct bit so that they can be combined in one integer
    """
    DISABLED = 1  # the queue machine is disabled
    ALARM = 2  # the queue machine is in alarm state (see man qstat)
    UNKNOWN = 4  # the queue machine is in unknown state because sge_execd cannot be contacted (see man qstat)
    ERROR = 8  # the queue is in error state
    OBSOLETE = 16  # the queue no longer exists but it is still visible because it still contains running jobs
    SUSPENDED = 32  # the queue machine is suspended
class QueueMachine: class QueueMachine:
""" """
a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10) a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
""" """
def __init__( self, queueName, machineName ): def __init__(self, queueName, machineName):
self.m_queueName = queueName self.m_queueName = queueName
self.m_machineName = machineName self.m_machineName = machineName
self.m_numSlots = None self.m_numSlots = None
@ -19,47 +20,62 @@ class QueueMachine:
self.m_fCpuLoad = None self.m_fCpuLoad = None
self.m_stateFlags = 0 self.m_stateFlags = 0
self.m_strDisableMessage = '' self.m_strDisableMessage = ''
def getName(self):
    """
    returns the name of the machine queue (such as allintel.q@simpatix10)

    this name is the queue name and the machine name, joined by '@'
    """
    return self.m_queueName + '@' + self.m_machineName
def getQueueName(self):
    """
    returns the name of the queue (the part of the queue machine name that precedes '@')
    """
    return self.m_queueName
def getMachineName(self):
    """
    returns the name of the machine (the part of the queue machine name that follows '@')
    """
    return self.m_machineName
def setNumSlots(self, numSlots):
    """
    numSlots : the number of slots of this queue machine (the value later returned by getNumSlots)
    """
    self.m_numSlots = numSlots
def setNumUsedSlots(self, numSlots):
    """
    numSlots : the number of used slots of this queue machine (the value later returned by getNumUsedSlots)
    """
    self.m_numUsedSlots = numSlots
def getNumSlots(self):
    """
    returns the number of slots set by setNumSlots

    the number of slots is expected to have been set beforehand (checked with an assertion)
    """
    assert self.m_numSlots is not None, 'the number of slots has not been set'
    return self.m_numSlots
def getNumUsedSlots(self):
    """
    returns the number of used slots set by setNumUsedSlots

    the number of used slots is expected to have been set beforehand (checked with an assertion)
    """
    assert self.m_numUsedSlots is not None, 'the number of used slots has not been set'
    return self.m_numUsedSlots
def setCpuLoad(self, fCpuLoad):
    """
    fCpuLoad : the cpu load of this queue machine (the value later returned by getCpuLoad)
    """
    self.m_fCpuLoad = fCpuLoad
def cpuLoadIsAvailable(self):
    """
    returns True if a cpu load has been set (m_fCpuLoad is not None), False otherwise
    """
    return self.m_fCpuLoad is not None


def getCpuLoad(self):
    """
    returns the cpu load set by setCpuLoad

    the cpu load is expected to be available, see cpuLoadIsAvailable (checked with an assertion)
    """
    assert self.m_fCpuLoad is not None, 'the cpu load is not available'
    return self.m_fCpuLoad
def setState(self, state):
    """
    state : the state of this queue machine, as a combination of QueueMachineStateFlags values
    """
    self.m_stateFlags = state
def isDisabled(self):
    """
    tells whether the DISABLED flag is set in the state of this queue machine

    the returned value is not a bool but the masked flag (non-zero if set, 0 otherwise)
    """
    return self.m_stateFlags & QueueMachineStateFlags.DISABLED
def isInErrorState(self):
    """
    tells whether the ERROR flag is set in the state of this queue machine

    the returned value is not a bool but the masked flag (non-zero if set, 0 otherwise)
    """
    return self.m_stateFlags & QueueMachineStateFlags.ERROR
def isResponding(self):
    """
    returns True if the UNKNOWN flag is not set in the state of this queue machine, False if it is set
    """
    return not (self.m_stateFlags & QueueMachineStateFlags.UNKNOWN)
def isInAlarmState(self):
    """
    tells whether the ALARM flag is set in the state of this queue machine

    the returned value is not a bool but the masked flag (non-zero if set, 0 otherwise)
    """
    return self.m_stateFlags & QueueMachineStateFlags.ALARM
def isSuspended(self):
    """
    tells whether the SUSPENDED flag is set in the state of this queue machine

    the returned value is not a bool but the masked flag (non-zero if set, 0 otherwise)
    """
    return self.m_stateFlags & QueueMachineStateFlags.SUSPENDED
""" """
def getStateAsString( self ): def getStateAsString(self):
assert( self.m_strState != None ) assert(self.m_strState is not None)
return self.m_strState return self.m_strState
""" """

View File

@ -1,80 +1,85 @@
from PowerState import * from PowerState import PowerState
from Log import * from Log import logInfo
import time import time
import copy import copy
class Slot:
    """
    a plain record whose three fields all start as None and are meant to be filled by the code that creates it
    """

    def __init__(self):
        self.m_queueMachine = None
        self.m_numSlots = None
        self.m_job = None  # job for which this slot is allocated
class SlotAllocator:
    """
    a class that defines a strategy for allocating free slots for the given pending jobs

    this class is abstract : subclasses have to override getMachinesThatNeedWakeUp
    """

    def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
        """
        returns the machines that need to wake up to make pending jobs running

        this method is abstract; it raises NotImplementedError (rather than relying on an assertion,
        which would be silently skipped when python runs with assertions disabled)
        """
        raise NotImplementedError('SlotAllocator.getMachinesThatNeedWakeUp is abstract')
class SimpleSlotAllocator(SlotAllocator):
    """
    a slot allocator that only looks at one pending job (the first one yielded by pendingJobs.values())
    and tries to find enough free slots for it, first on machines that are already on, then on
    machines that are asleep.

    NOTE(review): treating the first value of pendingJobs as the highest priority job relies on the
    ordering of pendingJobs, which is decided by the caller -- confirm.
    """

    def _allocateSlotsOnMachines(self, clusterState, job, numFreeSlots, remainingNumSlotsToAllocate, powerState, strMachinesDescription):
        """
        allocates slots for the given job on the queue machines whose machine is in the given power state

        @param numFreeSlots dictionary (queueMachine -> number of free slots); updated in place
        @param remainingNumSlotsToAllocate the number of slots that are still needed by the job
        @param powerState only machines in this power state are considered
        @param strMachinesDescription how these machines are described in log messages
        @return a tuple (number of slots still to find, dictionary machine name -> machine of the machines on which slots have been allocated)
        """
        logPrefix = 'SimpleSlotAllocator::getMachinesThatNeedWakeUp : '
        usedMachines = {}
        machines = clusterState.getMachines()
        for queueMachine in clusterState.getJobsState().getQueueMachines().values():
            if remainingNumSlotsToAllocate == 0:
                break
            logInfo(logPrefix + 'examining queueMachine %s ' % queueMachine.getName())
            if queueMachine.getMachineName() not in machines:
                # this machine is not under the cluster controller's control : its power state is unknown to us
                continue
            machine = machines[queueMachine.getMachineName()]
            if machine.getPowerState() != powerState:
                continue
            if not clusterState.queueMachineFitsJobRequirements(queueMachine, job.m_jobRequirements):
                continue
            numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
            if numSlotsAllocatedOnThisMachine <= 0:
                # this machine has nothing to offer : there's no point in using it (or in waking it up)
                continue
            logInfo(logPrefix + 'found %d slots on %s %s ' % (numSlotsAllocatedOnThisMachine, strMachinesDescription, queueMachine.getMachineName()))
            remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
            numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
            usedMachines[machine.getName()] = machine
            logInfo(logPrefix + 'still %d slots to find' % remainingNumSlotsToAllocate)
            assert remainingNumSlotsToAllocate >= 0
        return (remainingNumSlotsToAllocate, usedMachines)

    def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
        """
        returns the machines that need to wake up so that the first pending job can run

        @param pendingJobs dictionary of the pending jobs
        @param clusterState the current state of the cluster
        @return a dictionary (machine name -> machine) of the sleeping machines to wake up; this
            dictionary is empty if there are no pending jobs, if the job fits on machines that are
            already on, or if there are not enough slots available even with the sleeping machines
        """
        logPrefix = 'SimpleSlotAllocator::getMachinesThatNeedWakeUp : '
        if not pendingJobs:
            # nothing is waiting, so nothing needs to be woken up
            return {}
        # next(iter(...)) works both with lists (python 2) and with dictionary views (python 3), unlike values()[0]
        highestPriorityPendingJob = next(iter(pendingJobs.values()))
        logInfo(logPrefix + 'looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())

        numFreeSlots = {}  # contains the number of free slots for each queueMachine
        jobsState = clusterState.getJobsState()
        for queueMachine in jobsState.getQueueMachines().values():
            numFreeSlots[queueMachine] = jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
            logInfo(logPrefix + 'init numFreeSlots[%s] with %d ' % (queueMachine.getName(), numFreeSlots[queueMachine]))

        remainingNumSlotsToAllocate = highestPriorityPendingJob.m_jobRequirements.m_numSlots
        logInfo(logPrefix + 'still %d slots to find' % remainingNumSlotsToAllocate)

        # first look in running machines if there are available slots
        (remainingNumSlotsToAllocate, unused_runningMachines) = self._allocateSlotsOnMachines(
            clusterState, highestPriorityPendingJob, numFreeSlots, remainingNumSlotsToAllocate, PowerState.ON, 'already running')

        # now look into machines that are asleep (the helper does nothing if no more slots are needed)
        (remainingNumSlotsToAllocate, machinesThatNeedWakeUp) = self._allocateSlotsOnMachines(
            clusterState, highestPriorityPendingJob, numFreeSlots, remainingNumSlotsToAllocate, PowerState.SLEEP, 'sleeping')

        if remainingNumSlotsToAllocate != 0:
            return {}  # not enough slots available
        return machinesThatNeedWakeUp
class DecoupledSlotAllocator( SlotAllocator ):
class DecoupledSlotAllocator(SlotAllocator):
""" """
a slot allocator that doesn't know much about sge, and does not attempts to guess what sge'sceduler would do a slot allocator that doesn't know much about sge, and does not attempts to guess what sge'sceduler would do
Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in. Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
""" """
def __init__( self ): def __init__(self):
self.m_delayBetweenPeriodicChecks = -1 # in seconds. Disable periodic checks by setting this to -1 self.m_delayBetweenPeriodicChecks = -1 # in seconds. Disable periodic checks by setting this to -1
self.m_lastCheckTime = time.time() self.m_lastCheckTime = time.time()
self.m_lastClusterState = None self.m_lastClusterState = None
def jobsStateHasChanged( self, newClusterState ):
def jobsStateHasChanged(self, newClusterState):
""" """
returns true if there is a change in the cluster state that can cause a pending job returns true if there is a change in the cluster state that can cause a pending job
to start (provided all machines are enabled) to start (provided all machines are enabled)
@ -85,8 +90,8 @@ class DecoupledSlotAllocator( SlotAllocator ):
newJobs = newClusterState.m_jobsState.m_jobs newJobs = newClusterState.m_jobsState.m_jobs
bJobsHaveChanged = False bJobsHaveChanged = False
oldJobsOnly = oldJobs.copy() # shallow copy oldJobsOnly = oldJobs.copy() # shallow copy
#print 'oldJobs : ', oldJobs # print 'oldJobs : ', oldJobs
#print 'newJobs : ', newJobs # print 'newJobs : ', newJobs
""" """
print 'self.m_lastClusterState', self.m_lastClusterState print 'self.m_lastClusterState', self.m_lastClusterState
print 'newClusterState', newClusterState print 'newClusterState', newClusterState
@ -101,23 +106,24 @@ class DecoupledSlotAllocator( SlotAllocator ):
print 'id(newJobs) : ', id(newJobs) print 'id(newJobs) : ', id(newJobs)
""" """
for newJob in newJobs.values(): for newJob in newJobs.values():
#logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr()) # logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
if newJob.getId() in oldJobs: if newJob.getId() in oldJobs:
#logDebug('DecoupledSlotAllocator::jobsStateHasChanged job id=%d is in old jobs' % newJob.getId()) # logDebug('DecoupledSlotAllocator::jobsStateHasChanged job id=%d is in old jobs' % newJob.getId())
del oldJobsOnly[newJob.getId()] del oldJobsOnly[newJob.getId()]
else: else:
# ah ... a new job has arrived # ah ... a new job has arrived
logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr() ) logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr())
bJobsHaveChanged = True bJobsHaveChanged = True
if len(oldJobsOnly) != 0: if len(oldJobsOnly) != 0:
for oldJob in oldJobsOnly.values(): for oldJob in oldJobsOnly.values():
logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr() ) logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr())
# at least one old job has finished, freeing some slots # at least one old job has finished, freeing some slots
bJobsHaveChanged = True bJobsHaveChanged = True
return bJobsHaveChanged return bJobsHaveChanged
def getMachinesThatNeedWakeUp( self, pendingJobs, clusterState ):
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
machinesThatNeedWakeUp = {} machinesThatNeedWakeUp = {}
bJobsStateHasChanged = self.jobsStateHasChanged( clusterState ) bJobsStateHasChanged = self.jobsStateHasChanged(clusterState)
currentTime = time.time() currentTime = time.time()
# we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged # we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
# for example changes in the requirements, in the allocation policy, etc... # for example changes in the requirements, in the allocation policy, etc...
@ -132,10 +138,10 @@ class DecoupledSlotAllocator( SlotAllocator ):
for queueMachine in clusterState.getJobsState().getQueueMachines().values(): for queueMachine in clusterState.getJobsState().getQueueMachines().values():
if queueMachine.getMachineName() in clusterState.getMachines(): if queueMachine.getMachineName() in clusterState.getMachines():
# this means that the machine is under the cluster controller's control # this means that the machine is under the cluster controller's control
machine = clusterState.getMachines()[ queueMachine.getMachineName() ] machine = clusterState.getMachines()[queueMachine.getMachineName()]
if machine.getPowerState() == PowerState.SLEEP: if machine.getPowerState() == PowerState.SLEEP:
machinesThatNeedWakeUp[ machine.getName() ] = machine machinesThatNeedWakeUp[machine.getName()] = machine
self.m_lastCheckTime = currentTime self.m_lastCheckTime = currentTime
self.m_lastClusterState = copy.copy(clusterState) self.m_lastClusterState = copy.copy(clusterState)
#print 'self.m_lastClusterState', self.m_lastClusterState # print 'self.m_lastClusterState', self.m_lastClusterState
return machinesThatNeedWakeUp return machinesThatNeedWakeUp

View File

@ -1,9 +1,12 @@
import Util import time
from QstatParser import * from Util import executeProgram
from QstatParser import QstatParser
from Log import logDebug, logWarning
class SunGridEngine: class SunGridEngine:
def getCurrentJobsState( self ): def getCurrentJobsState(self):
bBUG_00000009_IS_STILL_ALIVE = True bBUG_00000009_IS_STILL_ALIVE = True
if bBUG_00000009_IS_STILL_ALIVE: if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Querying the current state of jobs') logDebug('Querying the current state of jobs')
@ -11,34 +14,33 @@ class SunGridEngine:
delayBetweenAttemps = 5 # in seconds delayBetweenAttemps = 5 # in seconds
while returnCode != 0: while returnCode != 0:
command = ['qstat', '-f', '-u', '*'] command = ['qstat', '-f', '-u', '*']
(returnCode, qstatOutput, stderr) = executeProgram( command ) (returnCode, qstatOutput, stderr) = executeProgram(command)
if returnCode != 0: if returnCode != 0:
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps)) logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps))
time.sleep(delayBetweenAttemps) time.sleep(delayBetweenAttemps)
if bBUG_00000009_IS_STILL_ALIVE: if bBUG_00000009_IS_STILL_ALIVE:
logDebug('Just got current state of jobs') logDebug('Just got current state of jobs')
jobsState = QstatParser().parseQstatOutput( qstatOutput ) jobsState = QstatParser().parseQstatOutput(qstatOutput)
jobsState.setTime( time.time() ) jobsState.setTime(time.time())
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge # read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
if False: # no need for job details at the moment and since it's very slow, it's been disabled if False: # no need for job details at the moment and since it's very slow, it's been disabled
for unused_jobId, job in jobsState.getPendingJobs().items(): for unused_jobId, job in jobsState.getPendingJobs().items():
(returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] ) (returnCode, stdout, stderr) = executeProgram(['qstat', '-j', job.getId().asStr()])
assert returnCode != 0, 'prout' assert returnCode != 0, 'prout'
QstatParser().parseJobDetails( stdout, job ) QstatParser().parseJobDetails(stdout, job)
return jobsState return jobsState
def setQueueInstanceActivation( self, strQueueInstanceName, bEnable ): def setQueueInstanceActivation(self, strQueueInstanceName, bEnable):
argument = 'd' argument = 'd'
if bEnable: if bEnable:
argument = 'e' argument = 'e'
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use) bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
delayBetweenAttemps = 5 # in seconds delayBetweenAttemps = 5 # in seconds
while True: while True:
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-'+argument, strQueueInstanceName]) errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-' + argument, strQueueInstanceName])
if bBUG_00000269_IS_STILL_ALIVE: if bBUG_00000269_IS_STILL_ALIVE:
# if the command failed, try again # if the command failed, try again
if errorCode == 0: if errorCode == 0:
@ -48,11 +50,9 @@ class SunGridEngine:
break break
return (errorCode == 0) return (errorCode == 0)
def queueIsEmpty(self, strMachineName):
    """
    tells whether no job is currently on the given machine, according to qstat

    @param strMachineName the name of the machine to query
    @return True if qstat reports no job on this machine, False otherwise
    @raise AssertionError if the qstat command fails
    """
    command = ['qstat', '-f', '-u', '*']
    (returnCode, qstatOutput, stderr) = executeProgram(command)
    if returnCode != 0:
        # raised explicitly (instead of using an assert statement) so that the check is still
        # performed when python runs with -O, and so that the failure comes with a diagnostic.
        # AssertionError is kept as the exception type for the callers that may rely on it.
        raise AssertionError('command "%s" failed (returnCode = %s, stderr="%s")' % (' '.join(command), returnCode, stderr))
    jobsState = QstatParser().parseQstatOutput(qstatOutput)
    jobs = jobsState.getJobsOnMachine(strMachineName)
    return (len(jobs) == 0)

View File

@ -1,12 +1,12 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
sys.path.insert(0, '..') sys.path.insert(0, '..')
from Log import * from Log import logInfo
import Util import Util
from PowerState import * from PowerState import PowerState
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
def Test0000(): def Test0000():
logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine') logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
strTargetMachineName = 'simpatix12' strTargetMachineName = 'simpatix12'
@ -14,15 +14,16 @@ def Test0000():
while True: while True:
if ePowerState == PowerState.ON: if ePowerState == PowerState.ON:
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName) bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
assert( bSuccess ) assert bSuccess
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName) bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
ePowerState = PowerState.SLEEP ePowerState = PowerState.SLEEP
elif ePowerState == PowerState.SLEEP: elif ePowerState == PowerState.SLEEP:
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
assert( bSuccess ) assert bSuccess
ePowerState = PowerState.ON ePowerState = PowerState.ON
else: else:
assert(False) assert False
def Test0001(): def Test0001():
logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same tim ?') logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same tim ?')
@ -30,12 +31,13 @@ def Test0001():
ePowerState = Util.getPowerState(strTargetMachineName) ePowerState = Util.getPowerState(strTargetMachineName)
if ePowerState == PowerState.SLEEP: if ePowerState == PowerState.SLEEP:
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
assert( bSuccess ) assert bSuccess
ePowerState = PowerState.ON ePowerState = PowerState.ON
assert(ePowerState == PowerState.ON) assert ePowerState == PowerState.ON
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName ) Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
assert(bSuccess) assert bSuccess
def Test0002(): def Test0002():
logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?') logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
@ -43,11 +45,12 @@ def Test0002():
ePowerState = Util.getPowerState(strTargetMachineName) ePowerState = Util.getPowerState(strTargetMachineName)
if ePowerState == PowerState.ON: if ePowerState == PowerState.ON:
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName) bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
assert( bSuccess ) assert bSuccess
ePowerState = PowerState.SLEEP ePowerState = PowerState.SLEEP
assert(ePowerState == PowerState.SLEEP) assert ePowerState == PowerState.SLEEP
Util.executeIpmiCommand( strTargetMachineName, 'chassis power on' ) Util.executeIpmiCommand(strTargetMachineName, 'chassis power on')
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName ) Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
if __name__ == '__main__': if __name__ == '__main__':
Test0000() Test0000()

View File

@ -1,18 +1,19 @@
#import .Util # import .Util
#import ..SimpaDbUtil # import ..SimpaDbUtil
from .Log import * from .Log import logDebug, logInfo, logWarning, logError
from .PowerState import * from .PowerState import PowerState, PowerStateToStr
import re import re
import io import io
import os import os
import traceback import traceback
import sys import sys
def executeProgram( astrArguments ):
def executeProgram(astrArguments):
bBUG_00000008_IS_STILL_ACTIVE = True bBUG_00000008_IS_STILL_ACTIVE = True
if bBUG_00000008_IS_STILL_ACTIVE: if bBUG_00000008_IS_STILL_ACTIVE:
logDebug('executeProgram : program = [%s]' % (','.join(astrArguments) )) logDebug('executeProgram : program = [%s]' % (','.join(astrArguments)))
(returnCode, stdout, stderr) = Lib.Util.executeProgram( astrArguments ) (returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments)
if bBUG_00000008_IS_STILL_ACTIVE: if bBUG_00000008_IS_STILL_ACTIVE:
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode)) logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
# for debugging purpose, log info in case the command failed # for debugging purpose, log info in case the command failed
@ -22,32 +23,34 @@ def executeProgram( astrArguments ):
logDebug('executeCommand : stderr of [%s] = %s' % (','.join(astrArguments), stderr)) logDebug('executeCommand : stderr of [%s] = %s' % (','.join(astrArguments), stderr))
return (returnCode, stdout, stderr) return (returnCode, stdout, stderr)
def executeCommand(command):
    """
    runs the given command by delegating to Lib.Util.executeCommand

    @param command the command to execute, passed unchanged to Lib.Util.executeCommand
    @return the tuple (returnCode, stdout, stderr) produced by Lib.Util.executeCommand
    """
    # NOTE(review): 'Lib' does not appear in this module's visible import block -- confirm that it is in scope
    (iReturnCode, strStdout, strStderr) = Lib.Util.executeCommand(command)
    return (iReturnCode, strStdout, strStderr)
def executeIpmiCommand( machineName, ipmiCommandArgs ):
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress( machineName ) def executeIpmiCommand(machineName, ipmiCommandArgs):
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName)
lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt' lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath] astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
astrProgram.extend( ipmiCommandArgs ) astrProgram.extend(ipmiCommandArgs)
#print 'executeIpmiCommand' # print 'executeIpmiCommand'
#print astrProgram # print astrProgram
bBUG_00000005_IS_STILL_ACTIVE = True bBUG_00000005_IS_STILL_ACTIVE = True
if bBUG_00000005_IS_STILL_ACTIVE: if bBUG_00000005_IS_STILL_ACTIVE:
# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged. # bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
bCommandSucceeded = False bCommandSucceeded = False
while not bCommandSucceeded: while not bCommandSucceeded:
(returnCode, stdout, stderr) = executeProgram( astrProgram ) (returnCode, stdout, stderr) = executeProgram(astrProgram)
if returnCode == 0: if returnCode == 0:
bCommandSucceeded = True bCommandSucceeded = True
else: else:
logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram)) logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
else: else:
(returnCode, stdout, stderr) = executeProgram( astrProgram ) (returnCode, stdout, stderr) = executeProgram(astrProgram)
""" """
sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State' sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
Unabled to establish a session with the BMC. Unabled to establish a session with the BMC.
@ -65,24 +68,25 @@ def executeIpmiCommand( machineName, ipmiCommandArgs ):
return (returnCode, stdout, stderr) return (returnCode, stdout, stderr)
def getPowerState( machineName ):
def getPowerState(machineName):
ePowerState = PowerState.UNKNOWN ePowerState = PowerState.UNKNOWN
bPowerStateRead = False bPowerStateRead = False
iNumFailedAttempts = 0 iNumFailedAttempts = 0
while not bPowerStateRead: while not bPowerStateRead:
(returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['sensor', 'get', 'ACPI State'] ) (returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State'])
if returnCode == 0: if returnCode == 0:
matchObj = re.search('\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout) matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
bBUG_00000002_IS_STILL_ACTIVE = True bBUG_00000002_IS_STILL_ACTIVE = True
if bBUG_00000002_IS_STILL_ACTIVE: if bBUG_00000002_IS_STILL_ACTIVE:
if matchObj == None: if matchObj is None:
# the following warning has been commented out because it pollutes the logs and apparently # the following warning has been commented out because it pollutes the logs and apparently
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then # it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
# no power on event is logged ... # no power on event is logged ...
#logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName) # logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
return PowerState.ON return PowerState.ON
else: else:
assert( matchObj ) assert matchObj
strAcpiState = matchObj.group('AcpiState') strAcpiState = matchObj.group('AcpiState')
if strAcpiState == 'S0/G0': if strAcpiState == 'S0/G0':
ePowerState = PowerState.ON ePowerState = PowerState.ON
@ -92,11 +96,11 @@ def getPowerState( machineName ):
ePowerState = PowerState.OFF ePowerState = PowerState.OFF
else: else:
print(strAcpiState) print(strAcpiState)
assert( False ) assert False
bPowerStateRead = True bPowerStateRead = True
else: else:
# error ... it's either because the machine is unplugged or because the machine is busy (well I'm not sure what happened but I had the case where the command failed for no apparent reason, and therefore I suspect it to be busy ). In order to differentiate these 2 cases, we try again and if this caommand fails too many times then we decide it's unplugged (very dodgy I know but I'm disapointed that this command doen't always work, and for now I don't know other ways to differentiate between these cases....) # error ... it's either because the machine is unplugged or because the machine is busy (well I'm not sure what happened but I had the case where the command failed for no apparent reason, and therefore I suspect it to be busy). In order to differentiate these 2 cases, we try again and if this caommand fails too many times then we decide it's unplugged (very dodgy I know but I'm disapointed that this command doen't always work, and for now I don't know other ways to differentiate between these cases....)
iMAX_NUM_ATTEMPTS=5 iMAX_NUM_ATTEMPTS = 5
iNumFailedAttempts += 1 iNumFailedAttempts += 1
if iNumFailedAttempts < iMAX_NUM_ATTEMPTS: if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName) logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
@ -107,17 +111,19 @@ def getPowerState( machineName ):
bPowerStateRead = True bPowerStateRead = True
return ePowerState return ePowerState
def wakeUpMachine(machineName):
    """
    requests the power on of the given machine using the ipmi command 'chassis power on'

    this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
    @param machineName the name of the machine to wake up
    @return true on success (the ipmi command returned 0), false otherwise
    @note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
    """
    ipmiResult = executeIpmiCommand(machineName, ['chassis', 'power', 'on'])
    (returnCode, unused_stdout, unused_stderr) = ipmiResult
    # this command can fail if the machine is manually unplugged for example
    return (returnCode == 0)
def blockingPutMachineToSleep( machineName ):
def blockingPutMachineToSleep(machineName):
""" """
@return true on success, false otherwise @return true on success, false otherwise
""" """
@ -136,14 +142,14 @@ def blockingPutMachineToSleep( machineName ):
while iDelay < iMaxGoToSleepDuration: while iDelay < iMaxGoToSleepDuration:
time.sleep(5) time.sleep(5)
iDelay += 5 iDelay += 5
ePowerState = getPowerState( machineName ) ePowerState = getPowerState(machineName)
if ePowerState == PowerState.SLEEP: if ePowerState == PowerState.SLEEP:
logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName) logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
return True return True
else: else:
if ePowerState != PowerState.ON: if ePowerState != PowerState.ON:
logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState))) logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
assert(ePowerState == PowerState.ON) assert ePowerState == PowerState.ON
iAttempt += 1 iAttempt += 1
if iAttempt > iMaxNumAttempts: if iAttempt > iMaxNumAttempts:
if bBUG_239_IS_STILL_ALIVE: if bBUG_239_IS_STILL_ALIVE:
@ -156,6 +162,7 @@ def blockingPutMachineToSleep( machineName ):
logWarning('the attempt to put %s to sleep failed... trying again' % (machineName)) logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
return True return True
def blockingWakeUpMachine(machineName): def blockingWakeUpMachine(machineName):
logInfo('waking up machine %s...' % machineName) logInfo('waking up machine %s...' % machineName)
numAttempts = 0 numAttempts = 0
@ -165,11 +172,11 @@ def blockingWakeUpMachine(machineName):
iNumWakeUpAttempts = 0 iNumWakeUpAttempts = 0
bWakeUpMachineSucceeded = False bWakeUpMachineSucceeded = False
while not bWakeUpMachineSucceeded: while not bWakeUpMachineSucceeded:
bWakeUpMachineSucceeded = wakeUpMachine( machineName ) bWakeUpMachineSucceeded = wakeUpMachine(machineName)
iNumWakeUpAttempts += 1 iNumWakeUpAttempts += 1
# the previous command can fail if the machine is already in a transition # the previous command can fail if the machine is already in a transition
# in that case we try sevral times bevire giving up # in that case we try sevral times bevire giving up
if(bWakeUpMachineSucceeded == False): if not bWakeUpMachineSucceeded:
if iNumWakeUpAttempts < iMaxNumWakeUpAttempts: if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
iDelay = 5 iDelay = 5
logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay)) logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
@ -180,18 +187,18 @@ def blockingWakeUpMachine(machineName):
bWakeUpFailed = False bWakeUpFailed = False
# wait until the machine is operational # wait until the machine is operational
WAKEUPTIMEOUT=5*60 # max number of seconds allowed for a machine to be alive after a wakeup request WAKEUPTIMEOUT = 5 * 60 # max number of seconds allowed for a machine to be alive after a wakeup request
wakeUpToAliveDuration = 0 wakeUpToAliveDuration = 0
while not Lib.SimpaDbUtil.isMachineResponding( machineName ): while not Lib.SimpaDbUtil.isMachineResponding(machineName):
time.sleep(5) time.sleep(5)
wakeUpToAliveDuration+=5 wakeUpToAliveDuration += 5
if wakeUpToAliveDuration > WAKEUPTIMEOUT: if wakeUpToAliveDuration > WAKEUPTIMEOUT:
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?) # the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT)) logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
bWakeUpFailed = True bWakeUpFailed = True
break break
if bWakeUpFailed: if bWakeUpFailed:
numAttempts+=1 numAttempts += 1
if numAttempts >= 2: if numAttempts >= 2:
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName)) logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
return False # power state changed manually ? return False # power state changed manually ?
@ -202,11 +209,12 @@ def blockingWakeUpMachine(machineName):
logInfo('Waking up of machine %s completed successfully' % machineName) logInfo('Waking up of machine %s completed successfully' % machineName)
return True return True
def onException(exception): def onException(exception):
sys.stdout.flush() sys.stdout.flush()
strExceptionType = type( exception ) strExceptionType = type(exception)
strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message) strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
#traceback.print_last() # traceback.print_last()
f = io.StringIO() f = io.StringIO()
traceback.print_exc(file=f) traceback.print_exc(file=f)
strMessage += f.getvalue() strMessage += f.getvalue()
@ -216,13 +224,11 @@ def onException(exception):
try: try:
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused # I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
#by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that is this operation fails, then the # by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that is this operation fails, then the
# kill of the main process is still executed. # kill of the main process is still executed.
Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage) Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
except BaseException: except BaseException:
logError("Could not send the email to notify the administrator that cluster controller failed") logError("Could not send the email to notify the administrator that cluster controller failed")
pass pass
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
exit() exit()

View File

@ -1,5 +1,6 @@
import Sensor import Sensor
class ClusterNodeSensorsReadings: class ClusterNodeSensorsReadings:
""" """
@ -13,22 +14,26 @@ class ClusterNodeSensorsReadings:
def __init__(self, clusterNodeName): def __init__(self, clusterNodeName):
self.m_clusterNodeName = clusterNodeName self.m_clusterNodeName = clusterNodeName
self.m_sensors = {} self.m_sensors = {}
#self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN # self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
return return
def addSensor(self, sensor): def addSensor(self, sensor):
self.m_sensors[sensor.m_name] = sensor self.m_sensors[sensor.m_name] = sensor
def dump(self): def dump(self):
for key,sensor in self.m_sensors.items(): for key, sensor in self.m_sensors.items():
sensor.dump() sensor.dump()
return return
#def getPowerState(self):
# def getPowerState(self):
# return self.m_powerState # return self.m_powerState
def getLowestTemperature( self ):
#log('ClusterNodeSensorsReadings::getLowestTemperature : start') def getLowestTemperature(self):
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
lowestTemperature = 0.0 lowestTemperature = 0.0
lowestTemperatureIsDefined = False lowestTemperatureIsDefined = False
for key,sensor in self.m_sensors.items(): for key, sensor in self.m_sensors.items():
#log('ClusterNodeSensorsReadings::getLowestTemperature : start') # log('ClusterNodeSensorsReadings::getLowestTemperature : start')
if sensor.typeName() == 'Temperature': if sensor.typeName() == 'Temperature':
sensor.m_temperature sensor.m_temperature
if lowestTemperatureIsDefined: if lowestTemperatureIsDefined:
@ -37,6 +42,6 @@ class ClusterNodeSensorsReadings:
else: else:
lowestTemperature = sensor.m_temperature lowestTemperature = sensor.m_temperature
lowestTemperatureIsDefined = True lowestTemperatureIsDefined = True
assert( lowestTemperatureIsDefined ) assert lowestTemperatureIsDefined
#log('ClusterNodeSensorsReadings::getLowestTemperature : end') # log('ClusterNodeSensorsReadings::getLowestTemperature : end')
return lowestTemperature return lowestTemperature

View File

@ -3,64 +3,66 @@ import re
from Sensor import FanSensor, TemperatureSensor from Sensor import FanSensor, TemperatureSensor
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
class IpmiTool202Parser:
    """
    parser for the per-sensor report handled by this class : a succession of records, each
    introduced by a 'Sensor ID' line, followed by an 'Entity ID' line, a 'Sensor Type' line
    and, for fans and temperature probes, a 'Sensor Reading' line
    """

    def parseSensorOutput(self, strOutput, clusterNodeName):
        """
        builds the sensor readings of a cluster node from the text of a sensor report

        :param strOutput: the whole text of the sensor report
        :param clusterNodeName: name of the node the report comes from
        :return: a ClusterNodeSensorsReadings containing one FanSensor per fan and one
            TemperatureSensor per temperature probe that reported a value. Sensors of
            other types and sensors whose reading is 'Not Present' are ignored
        :raises AssertionError: if a 'Sensor ID' line is not followed by the expected lines
        """
        sensorReadings = ClusterNodeSensorsReadings(clusterNodeName)
        with io.StringIO(strOutput) as f:
            line = f.readline()
            while line:
                matchObj = re.match(r'^Sensor ID[ ]*\: \'(?P<sensorName>[a-zA-Z 0-9]+)\'', line)
                if matchObj:
                    sensorName = matchObj.group('sensorName')
                    # the entity id itself is not used, but its line has to be consumed
                    line = f.readline()
                    matchObj = re.match(r'^ Entity ID[ ]*\: (?P<entityId>[0-9\.]+)', line)
                    assert matchObj, 'unexpected entity id line for sensor %s : %r' % (sensorName, line)
                    # read the sensor type
                    line = f.readline()
                    matchObj = re.match(r'^ Sensor Type[\(\)a-zA-Z ]*\: (?P<sensorType>[a-zA-Z \(\)]+)', line)
                    assert matchObj, 'unexpected sensor type line for sensor %s : %r' % (sensorName, line)
                    sensorType = matchObj.group('sensorType')
                    # reset for every record, so that a sensor without a reading is neither
                    # left undefined nor mistaken for the sensor of the previous record
                    sensor = None
                    if sensorType == 'Fan':
                        rpms = self.parseFanSensorOutput(f)
                        if rpms is not None:
                            sensor = FanSensor(sensorName)
                            sensor.m_rpms = rpms
                    elif sensorType == 'Temperature':
                        temperature = self.parseTemperatureSensorOutput(f)
                        if temperature is not None:
                            sensor = TemperatureSensor(sensorName)
                            sensor.m_temperature = temperature
                    # sensors of any other type are ignored
                    if sensor:
                        sensorReadings.addSensor(sensor)
                line = f.readline()
        return sensorReadings

    def parseFanSensorOutput(self, file):
        """
        reads the fan specific part of the report : the 'Sensor Reading' line

        :param file: text stream positioned on the 'Sensor Reading' line; one line is consumed
        :return: the fan speed in rpm as a float, or None if the reading is 'Not Present'
        :raises AssertionError: if the line is neither a rpm reading nor 'Not Present'
        """
        line = file.readline()
        matchObj = re.match(r'^ Sensor Reading[ ]*\: (?P<numRpms>[0-9]+) \(\+/\- (?P<rpmsPrecision>[0-9]+)\) RPM', line)
        if matchObj:
            return float(matchObj.group('numRpms'))
        matchObj = re.match(r'^ Sensor Reading[ ]*\: Not Present', line)
        assert matchObj, 'unexpected fan sensor reading line : %r' % line
        return None

    def parseTemperatureSensorOutput(self, file):
        """
        reads the temperature specific part of the report : the 'Sensor Reading' line, eg
            Sensor Reading : 36 (+/- 0) degrees C

        :param file: text stream positioned on the 'Sensor Reading' line; one line is consumed
        :return: the temperature in degrees C as a float, or None if the reading is 'Not Present'
        :raises AssertionError: if the line is neither a temperature reading nor 'Not Present'
        """
        line = file.readline()
        matchObj = re.match(r'^ Sensor Reading[ ]*\: (?P<temperature>[0-9]+) \(\+/\- (?P<precision>[0-9]+)\) degrees C', line)
        if matchObj:
            return float(matchObj.group('temperature'))
        matchObj = re.match(r'^ Sensor Reading[ ]*\: Not Present', line)
        assert matchObj, 'unexpected temperature sensor reading line : %r' % line
        return None

View File

@ -3,37 +3,37 @@ import re
from Sensor import FanSensor, TemperatureSensor from Sensor import FanSensor, TemperatureSensor
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings
class IpmiTool218Parser:
    """
    parser for a sensor listing made of one line per sensor, in the form
        <sensor name> | <value> | <unit> ...
    """

    def parseSensorOutput(self, strOutput, clusterNodeName):
        """
        builds the sensor readings of a cluster node from the text of a sensor listing

        :param strOutput: the whole text of the sensor listing
        :param clusterNodeName: name of the node the listing comes from
        :return: a ClusterNodeSensorsReadings containing a TemperatureSensor for each line
            whose unit is 'degrees C' and a FanSensor for each line whose unit is 'RPM';
            all other lines are ignored
        """
        readings = ClusterNodeSensorsReadings(clusterNodeName)
        stream = io.StringIO(strOutput)
        # readline returns an empty string once the end of the text is reached
        for row in iter(stream.readline, ''):
            found = re.match(r'^(?P<sensorName>[a-zA-Z 0-9]+[a-zA-Z 0-9]*[a-zA-Z0-9])[ ]*\| (?P<sensorValue>[\.0-9]+)[ ]*\| (?P<sensorUnit>[a-zA-Z0-9][a-zA-Z 0-9]*[a-zA-Z0-9])[?]*', row)
            if not found:
                continue
            name = found.group('sensorName')
            rawValue = found.group('sensorValue')
            unit = found.group('sensorUnit')
            if unit == 'degrees C':
                sensor = TemperatureSensor(name)
                sensor.m_temperature = float(rawValue)
            elif unit == 'RPM':
                sensor = FanSensor(name)
                sensor.m_rpms = float(rawValue)
            else:
                # units other than temperatures and fan speeds are not tracked
                sensor = None
            if sensor:
                readings.addSensor(sensor)
        stream.close()
        return readings

View File

@ -6,9 +6,9 @@ if sys.version_info < (3, 0):
else: else:
from io import StringIO from io import StringIO
import re import re
from .wol import * from .wol import wake_on_lan
import os import os
from .Util import * from .Util import executeProgram, executeCommand, log
import abc import abc
import sqlite3 import sqlite3
from .mysql2sqlite import mysql_to_sqlite from .mysql2sqlite import mysql_to_sqlite
@ -33,7 +33,7 @@ def isMachineResponding(machineName):
# don't stop the program until we understand bug00000004 # don't stop the program until we understand bug00000004
else: else:
log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName)) log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
assert(False) assert False
return False return False
@ -63,7 +63,7 @@ class RemoteMysqlDb(ISqlDatabaseBackend):
def _connect(self): def _connect(self):
self._conn = MySQLdb.connect(self._db_server_fqdn, self._db_user, '', self._db_name) self._conn = MySQLdb.connect(self._db_server_fqdn, self._db_user, '', self._db_name)
assert(self._conn) assert self._conn
def query(self, sql_query): def query(self, sql_query):
""" """
@ -163,13 +163,13 @@ class SqlDatabaseReader(object):
def machineNameToMacAddress(machineName): def machineNameToMacAddress(machineName):
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb') conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert(conn) assert conn
sqlQuery = """SELECT mac_address FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='normal'""" sqlQuery = """SELECT mac_address FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='normal'"""
# print sqlQuery # print sqlQuery
conn.query(sqlQuery) conn.query(sqlQuery)
r = conn.store_result() r = conn.store_result()
row = r.fetch_row(0) row = r.fetch_row(0)
assert( len(row) == 1) assert len(row) == 1
# print 'row =', row # print 'row =', row
macAddress = row[0][0] macAddress = row[0][0]
# print macAddress # print macAddress
@ -182,13 +182,13 @@ def getLightOutManagementIpAddress(machineName):
the light out management ip of servers allows to talk to the server even when it's asleep the light out management ip of servers allows to talk to the server even when it's asleep
""" """
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb') conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert(conn) assert conn
sqlQuery = """SELECT ip_address_1,ip_address_2,ip_address_3,ip_address_4 FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='light_out_management'""" sqlQuery = """SELECT ip_address_1,ip_address_2,ip_address_3,ip_address_4 FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='light_out_management'"""
# print sqlQuery # print sqlQuery
conn.query(sqlQuery) conn.query(sqlQuery)
r = conn.store_result() r = conn.store_result()
row = r.fetch_row(0) row = r.fetch_row(0)
assert(len(row) == 1) assert len(row) == 1
# print 'row =', row # print 'row =', row
ipAddress = ('%s.%s.%s.%s') % (row[0][0], row[0][1], row[0][2], row[0][3]) ipAddress = ('%s.%s.%s.%s') % (row[0][0], row[0][1], row[0][2], row[0][3])
# print macAddress # print macAddress
@ -199,7 +199,7 @@ def getLightOutManagementIpAddress(machineName):
def getClusterMachinesNames(): def getClusterMachinesNames():
clusterMachinesNames = [] clusterMachinesNames = []
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb') conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert(conn) assert conn
sqlQuery = """SELECT name FROM machines WHERE affectation='cluster'""" sqlQuery = """SELECT name FROM machines WHERE affectation='cluster'"""
# print sqlQuery # print sqlQuery
conn.query(sqlQuery) conn.query(sqlQuery)
@ -231,7 +231,7 @@ def putToSleep(machineName):
print 'stderr :' print 'stderr :'
print stderr print stderr
""" """
assert(returnCode == 0) assert returnCode == 0
# check if the command succeeded by looking at the output (that's the only way I found) # check if the command succeeded by looking at the output (that's the only way I found)
f = StringIO.StringIO(stdout) f = StringIO.StringIO(stdout)
line = f.readline() line = f.readline()

View File

@ -1,22 +1,28 @@
#!/usr/bin/python #!/usr/bin/python
#import sys # import sys
#sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python') # sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python')
import re import re
#import Lib.Util # import Lib.Util
class SgeConfig:
    r"""
    a set of sge configuration attributes, stored as a dictionary of strings (m_attrs :
    attribute name -> attribute value), that can be read from and written to two text layouts :
    - format 1 : one 'name value' pair per line, long values being continued on the next
      line with a trailing backslash
    - format 2 : comma separated 'name=value' pairs, or the word NONE when there is no attribute
    """

    def __init__(self):
        self.m_attrs = {}

    def hasAttr(self, attr_name):
        """
        :return: True if an attribute named attr_name is defined
        """
        return attr_name in self.m_attrs

    def getAttr(self, strAttrName):
        """
        :return: the value of the attribute named strAttrName
        :raises KeyError: if there is no such attribute
        """
        return self.m_attrs[strAttrName]

    def setAttr(self, strAttrName, strAttrValue):
        """
        defines (or redefines) the attribute strAttrName; both the name and the value are expected to be of type str
        """
        assert isinstance(strAttrName, str)
        assert isinstance(strAttrValue, str)
        self.m_attrs[strAttrName] = strAttrValue

    def loadFromSgeFormat1String(self, strSgeConfigString):
        r"""
        replaces the current attributes with the ones found in a string such as :
            hostname              simpatix11.univ-rennes1.fr
            usage_scaling         NONE
            report_variables      NONE
        a value can be split over multiple lines by ending a line with a backslash

        :raises AssertionError: if a non empty line is not made of a name followed by a value
        """
        self.m_attrs = {}
        # put multiline attributes on one line
        strSgeConfigString = re.sub(r"\\\n", "", strSgeConfigString)
        for strAttrDef in strSgeConfigString.split("\n"):
            if len(strAttrDef) != 0:
                # the name and the value are separated by one or more spaces
                matchObj = re.match(r"^(?P<attrName>[^\s]+)[ ]+(?P<attrValue>[^\s].*)$", strAttrDef)
                assert matchObj is not None, 'unexpected attribute definition : %r' % strAttrDef
                self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")

    def loadFromSgeFormat2String(self, strSgeConfigString):
        r"""
        replaces the current attributes with the ones found in a string such as :
            arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \
            np_load_short=1.296631,np_load_medium=1.281616, \
            np_load_long=1.271973
        the string NONE stands for no attribute at all

        :raises AssertionError: if a non blank element is not of the form name=value
        """
        self.m_attrs = {}
        if strSgeConfigString != "NONE":
            # put multiline attributes on one line (same treatment as format 1), otherwise the
            # backslash and the line break end up in the name of the next attribute
            strSgeConfigString = re.sub(r"\\\n", "", strSgeConfigString)
            for strAttrDef in strSgeConfigString.split(","):
                if len(strAttrDef.strip()) == 0:
                    # nothing between two commas (or after the last one)
                    continue
                matchObj = re.match(r"^\s*(?P<attrName>[^=]+)=(?P<attrValue>.*)$", strAttrDef)
                assert matchObj is not None, 'unexpected attribute definition : %r' % strAttrDef
                self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")

    def asFormat1String(self):
        """
        :return: the attributes as one 'name value' line per attribute (format 1)
        """
        astrLines = []
        for (k, v) in self.m_attrs.items():
            # if the attribute's value is a list of comma separated strings, make sure there are no spaces after the commas, otherwise the value is not properly interpreted when read back into sge
            # for example if the user sets the value of administrator_mail (using qconf -mconf global) to "alice@univ-rennes1.fr, bob@univ-rennes1.fr", sge then refuses the configuration with the message :
            # only a single value is allowed for configuration attribute "administrator_mail"
            cleaned_value = re.sub(r',\s*', ',', v)
            # prevent space pollution in space separated values, such as in reporting_params (see https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=2812). If spaces are not compacted, the space separated values will contain more and more spaces and at some point corrupt the value (a line containing just a backslash)
            cleaned_value = re.sub(r'\s+', ' ', cleaned_value)
            astrLines.append("%s %s\n" % (k, cleaned_value))
        return "".join(astrLines)

    def asFormat2String(self):
        """
        :return: the attributes as comma separated 'name=value' pairs (format 2), or NONE if there is no attribute
        """
        if len(self.m_attrs) == 0:
            return "NONE"
        return ",".join(["%s=%s" % (k, v) for (k, v) in self.m_attrs.items()])

    def dump(self):
        """
        prints each attribute on its own line
        """
        for (k, v) in self.m_attrs.items():
            print("['%s']='%s'" % (k, v))

View File

@ -15,6 +15,7 @@ else:
from html.parser import HTMLParser from html.parser import HTMLParser
from email.mime.text import MIMEText from email.mime.text import MIMEText
def sendTextMail(strFrom, to, strSubject, text): def sendTextMail(strFrom, to, strSubject, text):
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>" # from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
mail = MIMEText(text) mail = MIMEText(text)
@ -47,7 +48,7 @@ def log(message):
def executeProgram(astrArguments): def executeProgram(astrArguments):
# log('executeProgram : program [%s]' % (','.join(astrArguments))) # log('executeProgram : program [%s]' % (','.join(astrArguments)))
popen = subprocess.Popen( astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
stdout, stderr = popen.communicate() stdout, stderr = popen.communicate()
# popen.wait() # popen.wait()
result = (popen.returncode, stdout.decode(), stderr) result = (popen.returncode, stdout.decode(), stderr)
@ -60,7 +61,7 @@ def executeCommand(command):
""" """
executes the shell command such as 'set x=1; myprog $x' executes the shell command such as 'set x=1; myprog $x'
""" """
popen = subprocess.Popen( [command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time popen = subprocess.Popen([command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time
# if we don't specify the optional executable argument, then the default non interactive shell will be used. On debian, the default non-interactive shell is dash, which doesn't understand the keyword 'source' that we use in many places # if we don't specify the optional executable argument, then the default non interactive shell will be used. On debian, the default non-interactive shell is dash, which doesn't understand the keyword 'source' that we use in many places
stdout, stderr = popen.communicate() stdout, stderr = popen.communicate()
# popen.wait() # popen.wait()
@ -85,7 +86,6 @@ def executeCommandOn(target_machine_fqdn, command, user=None):
target = '%s@%s' % (user, target_machine_fqdn) target = '%s@%s' % (user, target_machine_fqdn)
else: else:
target = target_machine_fqdn target = target_machine_fqdn
result = executeProgram(['ssh', target, "%s" % command]) result = executeProgram(['ssh', target, "%s" % command])
logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user)) logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user))
return result return result
@ -94,11 +94,12 @@ def executeCommandOn(target_machine_fqdn, command, user=None):
def getUpsStatus(): def getUpsStatus():
class MyHTMLParser(HTMLParser): class MyHTMLParser(HTMLParser):
def __init__(self): def __init__(self):
HTMLParser.__init__(self) HTMLParser.__init__(self)
self.TokenList = [] self.TokenList = []
def handle_data( self, data): def handle_data(self, data):
data = data.strip() data = data.strip()
if data and len(data) > 0: if data and len(data) > 0:
self.TokenList.append(data) self.TokenList.append(data)
@ -118,7 +119,8 @@ def getUpsStatus():
return return
h = MyHTMLParser() h = MyHTMLParser()
h.feed(res) h.feed(res)
tokensList = h.GetTokenList() # @UnusedVariable tokensList = h.GetTokenList() # noqa:F841
if __name__ == '__main__': if __name__ == '__main__':
from SimpaDbUtil import wakeUp from SimpaDbUtil import wakeUp

View File

@ -1,63 +0,0 @@
'''
The goal of this application is to convert a mno database into mno's web site compatible database (drupal)
'''
import sqlite3
import os
import re
import sys
from SimpaDbUtil import SqlFile, SqlDatabaseReader
from _sqlite3 import Row
class OrchestraSqlDb(object):
    """
    gives access to the orchestra database by forwarding sql queries to a database reader
    """

    def __init__(self, sql_reader):
        """
        :param SqlDatabaseReader sql_reader: the inventory database
        """
        super(OrchestraSqlDb, self).__init__()
        self._sql_reader = sql_reader

    def query(self, sql_query):
        """
        runs sql_query on the underlying reader and returns whatever the reader returns
        """
        reader = self._sql_reader
        return reader.query(sql_query)
class Concert(object):
    """
    placeholder type for a concert : no attribute or method is defined yet
    """
class Recording(object):
    """
    placeholder type for a recording : no attribute or method is defined yet
    """
class OrchestraDb(object):
    """
    reads the concerts and their tracks from a sql dump of the orchestra's drupal database

    NOTE(review): self.concerts is created empty and nothing visible here ever fills it; for
    now the parsing only prints the titles it finds
    """

    def __init__(self, mno_drupal_db_sql_file_path):
        """
        :param str mno_drupal_db_sql_file_path: path of the sql file containing the drupal database
        """
        self.concerts = {}
        sql_source = SqlFile(mno_drupal_db_sql_file_path)
        sql_reader = SqlDatabaseReader(sql_source)
        orchestra_sql_db = OrchestraSqlDb(sql_reader)
        self._parse_from_orchestra_drupal_db(orchestra_sql_db)

    def _parse_from_orchestra_drupal_db(self, orchestra_sql_db):
        """
        prints the title of each concert, followed by the tab-indented titles of its tracks

        :param OrchestraSqlDb orchestra_sql_db:
        """
        concert_rows = orchestra_sql_db.query("SELECT nid,title FROM node WHERE type is 'concert'")
        for (nid, title) in concert_rows:
            print(title)
            nid = int(nid)
            track_id_rows = orchestra_sql_db.query("SELECT field_tracks_target_id FROM field_revision_field_tracks WHERE entity_id=%d" % nid)
            for (field_tracks_target_id, ) in track_id_rows:
                # same conversion as for nid : %d rejects ids that come back as strings
                track_rows = orchestra_sql_db.query("SELECT title FROM node WHERE nid=%d" % int(field_tracks_target_id))
                if len(track_rows) == 0:
                    # the concert references a track that has no node : nothing to print
                    continue
                (recording_title, ) = track_rows[0]
                print("\t%s" % recording_title)
# NOTE(review): this statement runs as soon as the module is loaded (there is no __main__ guard) and reads the database dump from a hard-coded path
mno_db = OrchestraDb('/Users/graffy/data/Perso/MeltingNotes_work.git/website/v2_drupal/melting_drupal.sql')

View File

@ -1,6 +1,7 @@
import re import re
def mysql_to_sqlite( mysql_sql_code, truncate_hex_strings = False ):
def mysql_to_sqlite(mysql_sql_code, truncate_hex_strings=False):
""" """
converts a mysql-compatible sql code into a sqlite-ompatible sql code converts a mysql-compatible sql code into a sqlite-ompatible sql code
@ -29,23 +30,23 @@ def mysql_to_sqlite( mysql_sql_code, truncate_hex_strings = False ):
content = COMMENTS_RE.sub('', content) content = COMMENTS_RE.sub('', content)
# sqlite doesn't like ' being escaped as \', use '' instead # sqlite doesn't like ' being escaped as \', use '' instead
content = re.sub(r'\\\'', '\'\'', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL ) content = re.sub(r'\\\'', '\'\'', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
if truncate_hex_strings: if truncate_hex_strings:
# sqlite doesn't like too big hex strings 0x613a343a7b733a383a # sqlite doesn't like too big hex strings 0x613a343a7b733a383a
content = re.sub(r'0x[0-9a-f]+', '0xdeadbeef', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL ) content = re.sub(r'0x[0-9a-f]+', '0xdeadbeef', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
# sqlite doesn't understand # sqlite doesn't understand
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL # `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
content = re.sub(r'\s+CHARACTER SET\s+[^\s]+', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL ) content = re.sub(r'\s+CHARACTER SET\s+[^\s]+', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
# sqlite doesn't know the utf8_bin : # sqlite doesn't know the utf8_bin :
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL # `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
#no such collation sequence: utf8_bin # no such collation sequence: utf8_bin
content = re.sub(r'\s+COLLATE\s+utf8_bin\s+', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL ) content = re.sub(r'\s+COLLATE\s+utf8_bin\s+', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
# sqlite doesn't like 'unsigned' as in `ip_address_3` tinyint(3) unsigned NOT NULL default '27', # sqlite doesn't like 'unsigned' as in `ip_address_3` tinyint(3) unsigned NOT NULL default '27',
content = re.sub(r' unsigned ', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL ) content = re.sub(r' unsigned ', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
# sqlite doesn't like 'enum' as in `type` enum('normal','light_out_management') NOT NULL default 'normal',, # sqlite doesn't like 'enum' as in `type` enum('normal','light_out_management') NOT NULL default 'normal',,
content = re.sub(r' enum\([^\)]*\) ', ' varchar(255) ', content) content = re.sub(r' enum\([^\)]*\) ', ' varchar(255) ', content)
@ -54,29 +55,27 @@ def mysql_to_sqlite( mysql_sql_code, truncate_hex_strings = False ):
# ALTER TABLE `blocked_ips` # ALTER TABLE `blocked_ips`
# ADD PRIMARY KEY (`iid`), # ADD PRIMARY KEY (`iid`),
# ADD KEY `blocked_ip` (`ip`); # ADD KEY `blocked_ip` (`ip`);
content = re.sub(r'alter table [^;]*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL ) content = re.sub(r'alter table [^;]*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
# COMMIT; # COMMIT;
# sqlite3.OperationalError: cannot commit - no transaction is active # sqlite3.OperationalError: cannot commit - no transaction is active
content = re.sub(r'commit\s*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL ) content = re.sub(r'commit\s*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
# insert multiple values # insert multiple values
# INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*\((.*)\*;', re.IGNORECASE | re.MULTILINE | re.DOTALL) # INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*\((.*)\*;', re.IGNORECASE | re.MULTILINE | re.DOTALL)
INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*([^;]*);', re.IGNORECASE | re.MULTILINE | re.DOTALL) INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*([^;]*);', re.IGNORECASE | re.MULTILINE | re.DOTALL)
#INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*((\[^\)](\)));$', re.IGNORECASE | re.MULTILINE | re.DOTALL) # INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*((\[^\)](\)));$', re.IGNORECASE | re.MULTILINE | re.DOTALL)
INSERTVALS_SPLIT_RE = re.compile(r'\)\s*,\s*\(', re.IGNORECASE | re.MULTILINE | re.DOTALL) INSERTVALS_SPLIT_RE = re.compile(r'\)\s*,\s*\(', re.IGNORECASE | re.MULTILINE | re.DOTALL)
def insertvals_replacer(match): def insertvals_replacer(match):
insert, values = match.groups() insert, values = match.groups()
# print("insert=%s"%insert) # print("insert=%s"%insert)
# print("values=%s"%values) # print("values=%s"%values)
values = re.sub('^\s*\(' ,'', values) values = re.sub(r'^\s*\(', '', values)
values = re.sub('\)\s*$' ,'', values) values = re.sub(r'\)\s*$', '', values)
replacement = '' replacement = ''
for vals in INSERTVALS_SPLIT_RE.split(values): for vals in INSERTVALS_SPLIT_RE.split(values):
#print("vals=%s"%vals) # print("vals=%s"%vals)
replacement = '%s\n%s (%s);' % (replacement, insert, vals) replacement = '%s\n%s (%s);' % (replacement, insert, vals)
return replacement return replacement

View File

@ -4,6 +4,7 @@
import socket import socket
import struct import struct
def wake_on_lan(macaddress): def wake_on_lan(macaddress):
""" Switches on remote computers using WOL. """ """ Switches on remote computers using WOL. """
@ -32,11 +33,10 @@ def wake_on_lan(macaddress):
if __name__ == '__main__': if __name__ == '__main__':
# Use macaddresses with any separators. # Use macaddresses with any separators.
wake_on_lan('00:1E:52:F3:61:60') # simpatix28 wake_on_lan('00:1E:52:F3:61:60') # simpatix28
#wake_on_lan('00:24:36:F2:D0:FA') # simpatix33 # wake_on_lan('00:24:36:F2:D0:FA') # simpatix33
#wake_on_lan('0F:0F:DF:0F:BF:EF') # wake_on_lan('0F:0F:DF:0F:BF:EF')
#wake_on_lan('0F-0F-DF-0F-BF-EF') # wake_on_lan('0F-0F-DF-0F-BF-EF')
# or without any separators. # or without any separators.
#wake_on_lan('0F0FDF0FBFEF') # wake_on_lan('0F0FDF0FBFEF')

View File

@ -1,7 +1,8 @@
from setuptools import setup from setuptools import setup
setup(name='cocluto', setup(
version=1.00, name='cocluto',
version=1.01,
description='compute cluster utility tools', description='compute cluster utility tools',
url='https://git.ipr.univ-rennes1.fr/graffy/cocluto', url='https://git.ipr.univ-rennes1.fr/graffy/cocluto',
author='Guillaume Raffy', author='Guillaume Raffy',