fixed pylint errors and cleaned up
work related to Bug 3315 - make simpaweb django app a packageable application
This commit is contained in:
parent 7a5d32dec0
commit 270304f58e
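The diff below applies the conventions pylint pushes for across the cluster-controller scripts: explicit imports instead of wildcard imports, print() calls instead of Python 2 print statements, bare assert statements instead of assert(...), is/is not None comparisons, PEP 8 spacing, and "except ... as ..." exception syntax. A minimal sketch of the before/after style follows; the names in it (report, machines) are hypothetical and only the style rules themselves are taken from this commit.

# Illustrative sketch of the conventions this commit moves the code to.
# The names below are hypothetical; only the style rules are from the diff.


def report(machines):
    # old style: print 'number of machines : %d' % len(machines)
    print('number of machines : %d' % len(machines))  # print() call, not a print statement
    # old style: assert( len(machines) >= 0 )
    assert len(machines) >= 0  # bare assert, no parentheses
    for name, machine in machines.items():
        # old style: if machine == None :
        if machine is None:  # identity comparison instead of == None
            continue
        print('%s is under control' % name)


if __name__ == '__main__':
    try:
        report({'simpatix10': object()})
    # old style: except BaseException, exception:
    except BaseException as exception:  # Python 3 compatible except syntax
        print('error: %s' % exception)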
@@ -2,57 +2,69 @@
import sys
sys.path.insert(0, '..')
import os
import MySQLdb
import threading
from Lib.Util import *
from Lib.SimpaDbUtil import *
import time
from ClusterStatus import ClusterStatus
from SlotAllocator import *
from Log import *
from ClusterNodeStatusUpdater import *
from SlotAllocator import DecoupledSlotAllocator
from Log import logDebug, logInfo
from ClusterNodeStatusUpdater import IWakeUpCompleteNotifier, ISleepCompleteNotifier
from SunGridEngine import SunGridEngine
import Util
from Util import log, onException
from WebServer import WebServerThread

from PowerState import PowerState
from HTMLParser import HTMLParser

VERSION='1.18'
VERSION = '1.18'


class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.TokenList = []
def handle_data( self,data):

def handle_data(self, data):
data = data.strip()
if data and len(data) > 0:
self.TokenList.append(data)
#print data
# print data

def GetTokenList(self):
return self.TokenList


class WakeUpCompleteNotifier( IWakeUpCompleteNotifier ):
class WakeUpCompleteNotifier(IWakeUpCompleteNotifier):

def __init__(self, machineName, clusterController):
self.m_machineName = machineName
self.m_clusterController = clusterController
def onWakeUpComplete( self ):

def onWakeUpComplete(self):
logDebug('WakeUpCompleteNotifier::onWakeUpComplete : start')
self.m_clusterController.onMachineWakeUpComplete( self.m_machineName )
self.m_clusterController.onMachineWakeUpComplete(self.m_machineName)


class SleepCompleteNotifier(ISleepCompleteNotifier):

class SleepCompleteNotifier( ISleepCompleteNotifier ):
def __init__(self, machineName, clusterController):
self.m_machineName = machineName
self.m_clusterController = clusterController
def onSleepComplete( self, bSleepSucceeded ):
logDebug('SleepCompleteNotifier::onSleepComplete : start')
self.m_clusterController.onMachineSleepComplete( self.m_machineName, bSleepSucceeded )

def jouleToKwh( fEnergyInJoules ):
def onSleepComplete(self, bSleepSucceeded):
logDebug('SleepCompleteNotifier::onSleepComplete : start')
self.m_clusterController.onMachineSleepComplete(self.m_machineName, bSleepSucceeded)


def jouleToKwh(fEnergyInJoules):
"""
converts joules to kWH
"""
# 1 kWh = 1000 * 3600 J
return fEnergyInJoules / (1000.0 * 3600.0)


class ClusterController:
"""
The cluster controller monitors the cluster's activity and has multiple purposes :
@@ -67,10 +79,10 @@ class ClusterController:
jobs (eg add some machines to a queue).
Mechanism to let user get priority
"""
def __init__( self ):
def __init__(self):
gridEngine = SunGridEngine()
self.m_clusterStatus = ClusterStatus( gridEngine )
self.m_slotAllocator = DecoupledSlotAllocator() #SimpleSlotAllocator()
self.m_clusterStatus = ClusterStatus(gridEngine)
self.m_slotAllocator = DecoupledSlotAllocator() # SimpleSlotAllocator()
self.m_machinesThatNeedWakeUp = {}
self.m_machinesThatNeedWakeupLock = threading.Lock() # to prevent concurrent access to m_machinesThatNeedWakeUp
self.m_machinesThatNeedSleeping = {}
@@ -82,47 +94,47 @@ class ClusterController:
self.m_bStop = False
self.m_bStopLock = threading.Lock() # to prevent concurrent access to m_bStop

def getClusterStatus( self ):
def getClusterStatus(self):
return self.m_clusterStatus

def log( self, message ):
print message
def log(self, message):
print(message)

def shutdownLeastImportantNode( self ):
def shutdownLeastImportantNode(self):
self.log("ClusterController::shutdownLeastImportantNode : start")

def onMachineWakeUpComplete( self, machineName ):
def onMachineWakeUpComplete(self, machineName):
self.m_machinesThatNeedWakeupLock.acquire()
#logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
del self.m_machinesThatNeedWakeUp[ machineName ]
#logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
del self.m_machinesThatNeedWakeUp[machineName]
# logDebug('ClusterController::onMachineWakeUpComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
self.m_machinesThatNeedWakeupLock.release()
logDebug('ClusterController::onMachineWakeUpComplete : removed %s from the list of machines that need waking up because it\'s now awake' % machineName)

def onMachineSleepComplete( self, machineName, bSleepSucceeded ):
def onMachineSleepComplete(self, machineName, bSleepSucceeded):
self.m_machinesThatNeedSleepingLock.acquire()
#logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
del self.m_machinesThatNeedSleeping[ machineName ]
#logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)) )
# logDebug('ClusterController::onMachineSleepComplete : machine %s old len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
del self.m_machinesThatNeedSleeping[machineName]
# logDebug('ClusterController::onMachineSleepComplete : machine %s new len(self.m_machinesThatNeedWakeUp) = %d' % (machineName,len(self.m_machinesThatNeedWakeUp)))
self.m_machinesThatNeedSleepingLock.release()
if bSleepSucceeded:
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it\'s now sleeping' % machineName)
else:
logDebug('ClusterController::onMachineSleepComplete : removed %s from the list of machines that need sleeping because it can\'t be put to sleep at the moment (eg a job just arrived)' % machineName)

def getNumPendingWakeUps( self ):
def getNumPendingWakeUps(self):
self.m_machinesThatNeedWakeupLock.acquire()
numPendingWakeUps = len(self.m_machinesThatNeedWakeUp)
self.m_machinesThatNeedWakeupLock.release()
return numPendingWakeUps

def getNumPendingSleeps( self ):
def getNumPendingSleeps(self):
self.m_machinesThatNeedSleepingLock.acquire()
numPendingSleeps = len(self.m_machinesThatNeedSleeping)
self.m_machinesThatNeedSleepingLock.release()
return numPendingSleeps

def putIdleMachinesToSleep( self ):
def putIdleMachinesToSleep(self):
self.m_clusterStatus.m_lock.acquire()
idleMachines = self.m_clusterStatus.getIdleMachines()
# logInfo('idleMachines :')
@@ -131,20 +143,19 @@ class ClusterController:
|
|||
if idleMachine.getPowerState() == PowerState.ON:
|
||||
# logInfo('\t%s' % machineName)
|
||||
if idleMachine.getName() != 'simpatix10': # never put simpatix10 to sleep because it's the sge master and is also server for other things
|
||||
self.m_machinesThatNeedSleeping[idleMachine.getName()]=idleMachine
|
||||
self.m_machinesThatNeedSleeping[idleMachine.getName()] = idleMachine
|
||||
self.m_clusterStatus.m_lock.release()
|
||||
|
||||
listOfMachinesThatNeedSleeping = self.m_machinesThatNeedSleeping.values() # duplicate the list so that we don't iterate on m_machinesThatNeedSleeping, which could cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
|
||||
for machine in listOfMachinesThatNeedSleeping:
|
||||
logInfo('ClusterController::putIdleMachinesToSleep : requesting sleep for %s because it\'s idle' % machine.getName())
|
||||
machine.requestSleep( SleepCompleteNotifier( machine.getName(), self ) )
|
||||
machine.requestSleep(SleepCompleteNotifier(machine.getName(), self))
|
||||
|
||||
if len(listOfMachinesThatNeedSleeping) != 0:
|
||||
# hack : wait until the sleep requests are handled so that we don't request the same machine to sleep multiple times
|
||||
while self.getNumPendingSleeps() > 0:
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
def wakeUpMachinesForPendingJobs(self):
|
||||
listOfMachinesThatNeedWakeUp = []
|
||||
|
||||
|
@@ -156,15 +167,15 @@ class ClusterController:
|
|||
logInfo('\t%d' % job.getId().asStr())
|
||||
"""
|
||||
if len(pendingJobs) != 0:
|
||||
self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp( pendingJobs, self.m_clusterStatus )
|
||||
self.m_machinesThatNeedWakeUp = self.m_slotAllocator.getMachinesThatNeedWakeUp(pendingJobs, self.m_clusterStatus)
|
||||
if len(self.m_machinesThatNeedWakeUp) == 0:
|
||||
None
|
||||
#logInfo('ClusterController::updateNormalState : no machine needs waking up' )
|
||||
# logInfo('ClusterController::updateNormalState : no machine needs waking up')
|
||||
else:
|
||||
listOfMachinesThatNeedWakeUp = self.m_machinesThatNeedWakeUp.values() # duplicate the list so that we don't iterate on m_machinesThatNeedWakeUp, which would cause a runtime error because callbacks alter m_machinesThatNeedWakeUp
|
||||
for machine in listOfMachinesThatNeedWakeUp:
|
||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for '+machine.getName() )
|
||||
machine.requestWakeUp( WakeUpCompleteNotifier( machine.getName(), self ) )
|
||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : requesting wake up for ' + machine.getName())
|
||||
machine.requestWakeUp(WakeUpCompleteNotifier(machine.getName(), self))
|
||||
self.m_clusterStatus.m_lock.release()
|
||||
|
||||
if len(listOfMachinesThatNeedWakeUp) != 0:
|
||||
|
@@ -178,49 +189,49 @@ class ClusterController:
|
|||
time.sleep(iSGE_CHEK_RUNNABLE_JOBS_DELAY) # note : this is annoying because it blocks the main thread. This could be improved if we forbid the machines to go to sleep for that much time....
|
||||
logInfo('ClusterController::wakeUpMachinesForPendingJobs : end of the delay given to SGE to allocate slots')
|
||||
|
||||
def updateNormalState( self ):
|
||||
def updateNormalState(self):
|
||||
# attempt to shut down machines that are idle
|
||||
self.putIdleMachinesToSleep()
|
||||
# wake up necessary machines if there are pending jobs
|
||||
self.wakeUpMachinesForPendingJobs()
|
||||
|
||||
def storeSessionInDatabase( self ):
|
||||
def storeSessionInDatabase(self):
|
||||
conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
|
||||
assert(conn)
|
||||
assert conn
|
||||
|
||||
# retrieve the session id, as it's an auto_increment field
|
||||
sqlCommand = "SELECT AUTO_INCREMENT FROM information_schema.TABLES WHERE TABLE_SCHEMA = 'clustercontroller' AND TABLE_NAME = 'sessions_desc'"
|
||||
print sqlCommand
|
||||
print(sqlCommand)
|
||||
conn.query(sqlCommand)
|
||||
r=conn.store_result()
|
||||
r = conn.store_result()
|
||||
iSessionId = r.fetch_row()[0][0]
|
||||
|
||||
# stores information about the session
|
||||
sqlCommand = "INSERT INTO `sessions_desc` (`start_time`, end_time, `program_version`, `machine_name`, `pid`, num_controlled_machines) VALUES (NOW(), NOW(), '%s', 'simpatix10', %d, %d);" % (VERSION, os.getpid(), len(self.m_clusterStatus.m_clusterNodes))
|
||||
print sqlCommand
|
||||
print(sqlCommand)
|
||||
conn.query(sqlCommand)
|
||||
|
||||
# initialize the energy savings table
|
||||
sqlCommand = "INSERT INTO session_to_energy_savings (session_id, energy_savings_kwh) VALUES (%d,0.0);" % (iSessionId)
|
||||
print sqlCommand
|
||||
print(sqlCommand)
|
||||
conn.query(sqlCommand)
|
||||
|
||||
conn.close()
|
||||
print( 'Session Iid = %d' % iSessionId )
|
||||
print('Session Iid = %d' % iSessionId)
|
||||
return iSessionId
|
||||
|
||||
def updateSessionEnergyConsumptionInDatabase( self ):
|
||||
def updateSessionEnergyConsumptionInDatabase(self):
|
||||
conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller')
|
||||
assert(conn)
|
||||
assert conn
|
||||
|
||||
# update energy savings for the current session
|
||||
sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % ( jouleToKwh(self.m_clusterStatus.getEnergySavings()) ,self.m_iSessionId)
|
||||
print sqlCommand
|
||||
sqlCommand = "UPDATE session_to_energy_savings SET energy_savings_kwh=%f WHERE session_id=%d;" % (jouleToKwh(self.m_clusterStatus.getEnergySavings()), self.m_iSessionId)
|
||||
print(sqlCommand)
|
||||
conn.query(sqlCommand)
|
||||
|
||||
# update the end time of the current session
|
||||
sqlCommand = "UPDATE sessions_desc SET end_time=NOW() WHERE session_id=%d;" % (self.m_iSessionId)
|
||||
print sqlCommand
|
||||
print(sqlCommand)
|
||||
conn.query(sqlCommand)
|
||||
|
||||
conn.close()
|
||||
|
@@ -231,7 +242,7 @@ class ClusterController:
|
|||
"""
|
||||
self.m_clusterStatus.setControlOnMachine(machineName, bControl)
|
||||
|
||||
def run( self ):
|
||||
def run(self):
|
||||
"""
|
||||
"""
|
||||
self.m_iSessionId = self.storeSessionInDatabase()
|
||||
|
@@ -247,24 +258,24 @@ class ClusterController:
|
|||
startTime = time.localtime()
|
||||
while not self.m_bStop:
|
||||
currentTime = time.time()
|
||||
#clusterStatus.m_nodesStatus['simpatix10'].dump()
|
||||
if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime +self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
|
||||
# clusterStatus.m_nodesStatus['simpatix10'].dump()
|
||||
if (not self.m_lastEnergyStatusLogTime) or (currentTime > (self.m_lastEnergyStatusLogTime + self.DELAY_BETWEEN_ENERGY_STATUS_LOGS)):
|
||||
iNumMachines = len(self.m_clusterStatus.m_clusterNodes)
|
||||
iNumMachinesOn = 0
|
||||
iNumSleepingMachines = 0
|
||||
for machine in self.m_clusterStatus.m_clusterNodes.values():
|
||||
ePowerState = machine.getPowerState()
|
||||
if ePowerState == PowerState.ON:
|
||||
iNumMachinesOn+=1
|
||||
iNumMachinesOn += 1
|
||||
elif ePowerState == PowerState.SLEEP:
|
||||
iNumSleepingMachines+=1
|
||||
iNumSleepingMachines += 1
|
||||
logInfo('%d machines (%d ON, %d SLEEPING)' % (iNumMachines, iNumMachinesOn, iNumSleepingMachines))
|
||||
iNumSlots = self.m_clusterStatus.getNumControlledSlots()
|
||||
iNumUsedSlots = self.m_clusterStatus.getNumUsedSlots()
|
||||
iNumWastedSlots = self.m_clusterStatus.getNumWastedSlots()
|
||||
iNumSleepingSlots = self.m_clusterStatus.getNumSleepingSlots()
|
||||
logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots ))
|
||||
logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()) )
|
||||
logInfo('%d slots (%d used, %d wasted, %d sleeping)' % (iNumSlots, iNumUsedSlots, iNumWastedSlots, iNumSleepingSlots))
|
||||
logInfo('cluster estimated power consumption : %f W (saving from cluster controller : %f W)' % (self.m_clusterStatus.getCurrentPowerConsumption(), self.m_clusterStatus.getCurrentPowerSavings()))
|
||||
logInfo('cluster estimated energy consumption since %s : %f kWh (saving from cluster controller : %f kWh)' % (time.asctime(startTime), jouleToKwh(self.m_clusterStatus.getEnergyConsumption()), jouleToKwh(self.m_clusterStatus.getEnergySavings())))
|
||||
self.updateSessionEnergyConsumptionInDatabase()
|
||||
self.m_lastEnergyStatusLogTime = currentTime
|
||||
|
@@ -274,11 +285,11 @@ class ClusterController:
|
|||
self.m_clusterStatus.stopReadingThreads()
|
||||
|
||||
|
||||
def storeClusterNodeStatus( clusterNodeStatus ):
|
||||
#conn = MySQLdb.connect('simpatix10', 'measures_writer', '', 'simpa_measurements')
|
||||
def storeClusterNodeStatus(clusterNodeStatus):
|
||||
# conn = MySQLdb.connect('simpatix10', 'measures_writer', '', 'simpa_measurements')
|
||||
conn = MySQLdb.connect('simpatix10', 'root', '', 'simpa_measurements')
|
||||
assert(conn)
|
||||
#conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""")
|
||||
assert conn
|
||||
# conn.query("""INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('titi', 2000, NOW());""")
|
||||
'''
|
||||
conn.query("""SELECT * FROM fan_rpm_logs""")
|
||||
r=conn.store_result()
|
||||
|
@@ -287,28 +298,29 @@ def storeClusterNodeStatus( clusterNodeStatus ):
|
|||
for key, sensor in clusterNodeStatus.m_sensors.items():
|
||||
sensorId = clusterNodeStatus.m_clusterNodeName + '_' + sensor.m_name
|
||||
if sensor.typeName() == 'Fan':
|
||||
sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('"""+sensorId+"""', """+str(sensor.m_rpms)+""", NOW());"""
|
||||
print sqlCommand
|
||||
sqlCommand = """INSERT INTO `fan_rpm_logs` (`fan_id`, `rpm`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_rpms) + """, NOW());"""
|
||||
print(sqlCommand)
|
||||
conn.query(sqlCommand)
|
||||
elif sensor.typeName() == 'Temperature':
|
||||
sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('"""+sensorId+"""', """+str(sensor.m_temperature)+""", NOW());"""
|
||||
print sqlCommand
|
||||
sqlCommand = """INSERT INTO `temperature_logs` (`temp_sensor_id`, `temperature`, `date`) VALUES ('""" + sensorId + """', """ + str(sensor.m_temperature) + """, NOW());"""
|
||||
print(sqlCommand)
|
||||
conn.query(sqlCommand)
|
||||
else:
|
||||
assert(False)
|
||||
assert False
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
#Lib.Util.sendTextMail( 'SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content')
|
||||
# Lib.Util.sendTextMail('SimpaCluster <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'mail subject', 'mail content')
|
||||
try:
|
||||
logInfo('ClusterController v. %s starting....' % VERSION)
|
||||
#executeCommand('ping -o -t 1 simpatix310 > /dev/null')
|
||||
#print executeCommand('ssh simpatix10 "ipmitool sensor"')
|
||||
#assert False, 'prout'
|
||||
# executeCommand('ping -o -t 1 simpatix310 > /dev/null')
|
||||
# print executeCommand('ssh simpatix10 "ipmitool sensor"')
|
||||
# assert False, 'prout'
|
||||
controller = ClusterController()
|
||||
controller.run()
|
||||
#machineNameToMacAddress( 'simpatix10' )
|
||||
#except AssertionError, error:
|
||||
#except KeyboardInterrupt, error:
|
||||
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||
Util.onException(exception)
|
||||
# machineNameToMacAddress('simpatix10')
|
||||
# except AssertionError, error:
|
||||
# except KeyboardInterrupt, error:
|
||||
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||
onException(exception)
|
||||
|
@@ -1,48 +1,50 @@
|
|||
import threading
|
||||
from PowerState import *
|
||||
from ClusterNodeStatusUpdater import *
|
||||
from PowerState import PowerState, PowerStateToStr
|
||||
from ClusterNodeStatusUpdater import ClusterNodeStatusUpdater
|
||||
import Lib.Util
|
||||
import Lib.SimpaDbUtil
|
||||
from Log import logInfo, logWarning
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from datetime import *
|
||||
|
||||
class ClusterNode:
|
||||
"""
|
||||
the state of a machine node
|
||||
"""
|
||||
def __init__( self, machineName, cluster, gridEngine ):
|
||||
def __init__(self, machineName, cluster, gridEngine):
|
||||
self.m_name = machineName
|
||||
self.m_cluster = cluster # the cluster this machine belongs to
|
||||
self.m_requestedPowerState = PowerState.ON
|
||||
self.m_powerState = PowerState.UNKNOWN
|
||||
self.m_lastPowerStateTime = None # time at which the last value of self.m_powerState has been set
|
||||
self.m_machineStatusUpdater = ClusterNodeStatusUpdater( machineName, self, gridEngine )
|
||||
self.m_machineStatusUpdater = ClusterNodeStatusUpdater(machineName, self, gridEngine)
|
||||
self.m_energyConsumption = 0.0 # estimate of the energy consumption of this machine since the start of cluster controller (in joules)
|
||||
self.m_energySavings = 0.0 # estimate of the energy savings on this machine caused by the cluster controller since it started (in joules)
|
||||
|
||||
def getName( self ):
|
||||
def getName(self):
|
||||
return self.m_name
|
||||
|
||||
def isReady( self ):
|
||||
def isReady(self):
|
||||
if self.m_powerState == PowerState.UNKNOWN:
|
||||
#logInfo( self.m_name + ' is not ready (waiting for power state)' )
|
||||
# logInfo(self.m_name + ' is not ready (waiting for power state)')
|
||||
return False
|
||||
if self.m_powerState == PowerState.ON:
|
||||
return True
|
||||
#log( self.m_name + ' is ready' )
|
||||
# log(self.m_name + ' is ready')
|
||||
return True
|
||||
|
||||
def getPowerState( self ):
|
||||
def getPowerState(self):
|
||||
return self.m_powerState
|
||||
|
||||
def setShouldAlwaysBeOn( self ):
|
||||
self.m_machineStatusUpdater.setShouldAlwaysBeOn( )
|
||||
self.setPowerState( PowerState.ON )
|
||||
def setShouldAlwaysBeOn(self):
|
||||
self.m_machineStatusUpdater.setShouldAlwaysBeOn()
|
||||
self.setPowerState(PowerState.ON)
|
||||
|
||||
def setPowerState( self, powerState ):
|
||||
def setPowerState(self, powerState):
|
||||
bUpdateRequiredChecks = False
|
||||
if self.m_powerState == PowerState.UNKNOWN:
|
||||
logInfo('ClusterNode::setPowerState : '+self.m_name+'\'s power state has been initialized to '+PowerStateToStr( powerState ))
|
||||
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been initialized to ' + PowerStateToStr(powerState))
|
||||
self.m_powerState = powerState
|
||||
self.m_lastPowerStateTime = datetime.now()
|
||||
bUpdateRequiredChecks = True
|
||||
|
@@ -51,7 +53,7 @@ class ClusterNode:
|
|||
self.updateEnergyMeasurements()
|
||||
# then change the power state
|
||||
if self.m_powerState != powerState:
|
||||
logInfo('ClusterNode::setPowerState : '+self.m_name+'\'s power state has been changed to '+PowerStateToStr( powerState ))
|
||||
logInfo('ClusterNode::setPowerState : ' + self.m_name + '\'s power state has been changed to ' + PowerStateToStr(powerState))
|
||||
self.m_powerState = powerState
|
||||
self.m_lastPowerStateTime = datetime.now()
|
||||
bUpdateRequiredChecks = True
|
||||
|
@@ -69,18 +71,18 @@ class ClusterNode:
|
|||
self.m_machineStatusUpdater.m_bCheckPowerState = True
|
||||
self.m_machineStatusUpdater.m_bCheckSensors = False
|
||||
else:
|
||||
assert( False )
|
||||
assert False
|
||||
|
||||
def onNewPowerStateReading( self, powerState ):
|
||||
def onNewPowerStateReading(self, powerState):
|
||||
"""
|
||||
called when a new powerstate reading arrives
|
||||
"""
|
||||
if powerState != self.getPowerState():
|
||||
if self.getPowerState() != PowerState.UNKNOWN:
|
||||
logWarning('ClusterNode::onNewPowerStateReading : '+self.m_name+'\'s power state has been (manually it seems) changed to '+PowerStateToStr( powerState ))
|
||||
self.setPowerState( powerState )
|
||||
logWarning('ClusterNode::onNewPowerStateReading : ' + self.m_name + '\'s power state has been (manually it seems) changed to ' + PowerStateToStr(powerState))
|
||||
self.setPowerState(powerState)
|
||||
|
||||
def getPowerConsumptionForPowerState( self, ePowerState ):
|
||||
def getPowerConsumptionForPowerState(self, ePowerState):
|
||||
"""
|
||||
returns the power consumption estimation (in watts) of this machine for the given power state
|
||||
"""
|
||||
|
@@ -96,45 +98,45 @@ class ClusterNode:
|
|||
elif ePowerState == PowerState.UNPLUGGED:
|
||||
fCurrentIntensity = 0.0
|
||||
else:
|
||||
assert(False)
|
||||
assert False
|
||||
return fCurrentIntensity * fCurrentVoltage
|
||||
|
||||
def updateEnergyMeasurements( self ):
|
||||
def updateEnergyMeasurements(self):
|
||||
timeInterval = datetime.now() - self.m_lastPowerStateTime
|
||||
self.m_energyConsumption += self.getPowerConsumptionForPowerState( self.m_powerState ) * timeInterval.seconds
|
||||
self.m_energySavings += ( self.getPowerConsumptionForPowerState( PowerState.ON ) - self.getPowerConsumptionForPowerState( self.m_powerState ) ) * timeInterval.seconds
|
||||
self.m_energyConsumption += self.getPowerConsumptionForPowerState(self.m_powerState) * timeInterval.seconds
|
||||
self.m_energySavings += (self.getPowerConsumptionForPowerState(PowerState.ON) - self.getPowerConsumptionForPowerState(self.m_powerState)) * timeInterval.seconds
|
||||
self.m_lastPowerStateTime = datetime.now()
|
||||
#logDebug('energy savings on %s : %f J' %(self.getName(), self.m_energySavings))
|
||||
# logDebug('energy savings on %s : %f J' %(self.getName(), self.m_energySavings))
|
||||
|
||||
def getEnergyConsumption( self ):
|
||||
def getEnergyConsumption(self):
|
||||
"""
|
||||
in joules
|
||||
"""
|
||||
self.updateEnergyMeasurements()
|
||||
return self.m_energyConsumption
|
||||
|
||||
def getPowerConsumption( self ):
|
||||
fCurrentPowerConsumption = self.getPowerConsumptionForPowerState( self.m_powerState )
|
||||
#logDebug('getPowerConsumption of %s : %f (powerstate = %d)' % (self.getName(), fCurrentPowerConsumption, self.m_powerState))
|
||||
def getPowerConsumption(self):
|
||||
fCurrentPowerConsumption = self.getPowerConsumptionForPowerState(self.m_powerState)
|
||||
# logDebug('getPowerConsumption of %s : %f (powerstate = %d)' % (self.getName(), fCurrentPowerConsumption, self.m_powerState))
|
||||
return fCurrentPowerConsumption
|
||||
|
||||
def getEnergySavings( self ):
|
||||
def getEnergySavings(self):
|
||||
self.updateEnergyMeasurements()
|
||||
return self.m_energySavings
|
||||
|
||||
def onSleepFailedBecauseAJobJustArrived( self ):
|
||||
def onSleepFailedBecauseAJobJustArrived(self):
|
||||
logInfo('%s was scheduled to sleep but the sleep is canceled because it\'s currently executing a new job' % self.m_name)
|
||||
|
||||
def requestSleep( self, sleepCompleteNotifier = None ):
|
||||
self.m_machineStatusUpdater.requestSleep( sleepCompleteNotifier )
|
||||
def requestSleep(self, sleepCompleteNotifier=None):
|
||||
self.m_machineStatusUpdater.requestSleep(sleepCompleteNotifier)
|
||||
|
||||
def requestWakeUp( self, wakeUpCompleteNotifier = None ):
|
||||
self.m_machineStatusUpdater.requestWakeUp( wakeUpCompleteNotifier )
|
||||
def requestWakeUp(self, wakeUpCompleteNotifier=None):
|
||||
self.m_machineStatusUpdater.requestWakeUp(wakeUpCompleteNotifier)
|
||||
|
||||
def getQueueMachineName( self ):
|
||||
return self.getCluster().getJobsState().getQueueMachine( self.m_name ).getName()
|
||||
assert( self.m_queueName != None )
|
||||
def getQueueMachineName(self):
|
||||
return self.getCluster().getJobsState().getQueueMachine(self.m_name).getName()
|
||||
assert self.m_queueName is not None
|
||||
return self.m_queueName
|
||||
|
||||
def getCluster( self ):
|
||||
def getCluster(self):
|
||||
return self.m_cluster
|
||||
|
@@ -2,143 +2,147 @@ import threading
|
|||
import time
|
||||
import Lib.Util
|
||||
import Lib.SimpaDbUtil
|
||||
import os
|
||||
import traceback
|
||||
import sys
|
||||
from PowerState import *
|
||||
from QstatParser import *
|
||||
import Util
|
||||
from PowerState import PowerState
|
||||
from Log import logInfo, logDebug
|
||||
from Util import blockingWakeUpMachine, blockingPutMachineToSleep, getPowerState, onException
|
||||
|
||||
|
||||
class IWakeUpCompleteNotifier:
|
||||
"""
|
||||
interface for wakeup notifiers
|
||||
"""
|
||||
def onWakeUpComplete( self ):
|
||||
assert( False )
|
||||
def onWakeUpComplete(self):
|
||||
assert False
|
||||
|
||||
|
||||
class ISleepCompleteNotifier:
|
||||
"""
|
||||
interface for sleep notifiers
|
||||
"""
|
||||
def onSleepComplete( self, bSleepSucceeded ):
|
||||
assert( False )
|
||||
def onSleepComplete(self, bSleepSucceeded):
|
||||
assert False
|
||||
|
||||
|
||||
class IRequest:
|
||||
GO_TO_SLEEP = 1
|
||||
WAKE_UP = 2
|
||||
CHECK_POWER_STATE = 3
|
||||
|
||||
def __init__( self, requestType ):
|
||||
def __init__(self, requestType):
|
||||
self.m_type = requestType
|
||||
|
||||
def getType( self ):
|
||||
def getType(self):
|
||||
return self.m_type
|
||||
|
||||
def process( self, clusterNodeStatusUpdater ):
|
||||
def process(self, clusterNodeStatusUpdater):
|
||||
"""
|
||||
processes this request
|
||||
"""
|
||||
assert( False ) # this method is abstract
|
||||
assert False # this method is abstract
|
||||
|
||||
class WakeUpRequest( IRequest ):
|
||||
|
||||
def __init__( self, wakeUpNotifier ):
|
||||
IRequest.__init__( self, IRequest.WAKE_UP )
|
||||
class WakeUpRequest(IRequest):
|
||||
|
||||
def __init__(self, wakeUpNotifier):
|
||||
IRequest.__init__(self, IRequest.WAKE_UP)
|
||||
self.m_wakeUpNotifier = wakeUpNotifier
|
||||
|
||||
def process( self, clusterNodeStatusUpdater ):
|
||||
assert( clusterNodeStatusUpdater.m_bShouldAlwaysBeOn == False ) # are we attempting to wake up a machine that should always be on ?
|
||||
logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName() )
|
||||
bSuccess = blockingWakeUpMachine( clusterNodeStatusUpdater.getName() )
|
||||
assert( bSuccess )
|
||||
def process(self, clusterNodeStatusUpdater):
|
||||
assert clusterNodeStatusUpdater.m_bShouldAlwaysBeOn is False # are we attempting to wake up a machine that should always be on ?
|
||||
logInfo('Handling wakeup request for %s' % clusterNodeStatusUpdater.getName())
|
||||
bSuccess = blockingWakeUpMachine(clusterNodeStatusUpdater.getName())
|
||||
assert bSuccess
|
||||
# activate the associated machine queue
|
||||
if clusterNodeStatusUpdater.setQueueActivation( True ):
|
||||
None # all is ok
|
||||
if clusterNodeStatusUpdater.setQueueActivation(True):
|
||||
pass # all is ok
|
||||
else:
|
||||
assert( False )
|
||||
assert False
|
||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState( PowerState.ON )
|
||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON)
|
||||
clusterNodeStatusUpdater.m_stateLock.release()
|
||||
if self.m_wakeUpNotifier:
|
||||
logDebug('ClusterNodeStatusUpdater::run : Sending wakeup notification')
|
||||
self.m_wakeUpNotifier.onWakeUpComplete()
|
||||
|
||||
class SleepRequest( IRequest ):
|
||||
|
||||
def __init__( self, sleepCompleteNotifier ):
|
||||
IRequest.__init__( self, IRequest.GO_TO_SLEEP )
|
||||
class SleepRequest(IRequest):
|
||||
|
||||
def __init__(self, sleepCompleteNotifier):
|
||||
IRequest.__init__(self, IRequest.GO_TO_SLEEP)
|
||||
self.m_sleepCompleteNotifier = sleepCompleteNotifier
|
||||
|
||||
def process( self, clusterNodeStatusUpdater ):
|
||||
assert( clusterNodeStatusUpdater.m_bShouldAlwaysBeOn == False ) # are we attempting to put a machine the should stay on to sleep ?
|
||||
logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName() )
|
||||
if clusterNodeStatusUpdater.setQueueActivation( False ):
|
||||
def process(self, clusterNodeStatusUpdater):
|
||||
assert not clusterNodeStatusUpdater.m_bShouldAlwaysBeOn # are we attempting to put a machine the should stay on to sleep ?
|
||||
logInfo('Handling sleep request for %s' % clusterNodeStatusUpdater.getName())
|
||||
if clusterNodeStatusUpdater.setQueueActivation(False):
|
||||
if clusterNodeStatusUpdater.queueIsEmpty():
|
||||
if blockingPutMachineToSleep( clusterNodeStatusUpdater.m_clusterNodeName ):
|
||||
if blockingPutMachineToSleep(clusterNodeStatusUpdater.m_clusterNodeName):
|
||||
# now we know that the machine is asleep
|
||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState( PowerState.SLEEP )
|
||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.SLEEP)
|
||||
clusterNodeStatusUpdater.m_stateLock.release()
|
||||
if self.m_sleepCompleteNotifier:
|
||||
self.m_sleepCompleteNotifier.onSleepComplete( True )
|
||||
self.m_sleepCompleteNotifier.onSleepComplete(True)
|
||||
else:
|
||||
assert( False )
|
||||
assert False
|
||||
else:
|
||||
# reactivate the queue
|
||||
if not clusterNodeStatusUpdater.setQueueActivation( True ):
|
||||
assert( False )
|
||||
if not clusterNodeStatusUpdater.setQueueActivation(True):
|
||||
assert False
|
||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState( PowerState.ON ) # this is necessary to reenable the various cyclic checks that were disabled on sleep request
|
||||
clusterNodeStatusUpdater.m_clusterNode.setPowerState(PowerState.ON) # this is necessary to reenable the various cyclic checks that were disabled on sleep request
|
||||
clusterNodeStatusUpdater.m_stateLock.release()
|
||||
clusterNodeStatusUpdater.m_clusterNode.onSleepFailedBecauseAJobJustArrived()
|
||||
if self.m_sleepCompleteNotifier:
|
||||
self.m_sleepCompleteNotifier.onSleepComplete( False )
|
||||
self.m_sleepCompleteNotifier.onSleepComplete(False)
|
||||
else:
|
||||
assert( False )
|
||||
assert False
|
||||
|
||||
class CheckPowerStateRequest( IRequest ):
|
||||
|
||||
def __init__( self ):
|
||||
IRequest.__init__( self, IRequest.CHECK_POWER_STATE )
|
||||
class CheckPowerStateRequest(IRequest):
|
||||
|
||||
def process( self, clusterNodeStatusUpdater ):
|
||||
powerState = Util.getPowerState( clusterNodeStatusUpdater.m_clusterNodeName )
|
||||
def __init__(self):
|
||||
IRequest.__init__(self, IRequest.CHECK_POWER_STATE)
|
||||
|
||||
def process(self, clusterNodeStatusUpdater):
|
||||
powerState = getPowerState(clusterNodeStatusUpdater.m_clusterNodeName)
|
||||
clusterNodeStatusUpdater.m_stateLock.acquire()
|
||||
clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading( powerState )
|
||||
clusterNodeStatusUpdater.m_clusterNode.onNewPowerStateReading(powerState)
|
||||
clusterNodeStatusUpdater.m_lastPowerStateCheckTime = time.time()
|
||||
clusterNodeStatusUpdater.m_stateLock.release()
|
||||
|
||||
class ClusterNodeStatusUpdater( threading.Thread ):
|
||||
DELAY_BETWEEN_POWERSTATE_CHECKS=5*60 # in seconds
|
||||
|
||||
def __init__( self, machineName, clusterNode, gridEngine ):
|
||||
class ClusterNodeStatusUpdater(threading.Thread):
|
||||
DELAY_BETWEEN_POWERSTATE_CHECKS = 5 * 60 # in seconds
|
||||
|
||||
def __init__(self, machineName, clusterNode, gridEngine):
|
||||
threading.Thread.__init__(self)
|
||||
self.m_clusterNodeName = machineName
|
||||
self.m_clusterNode = clusterNode
|
||||
self.m_gridEngine = gridEngine
|
||||
self.m_bStop = False
|
||||
self.m_lastPowerStateCheckTime = None #time.time()
|
||||
self.m_lastPowerStateCheckTime = None # time.time()
|
||||
self.m_bCheckPowerState = True
|
||||
self.m_stateLock = threading.Lock() # lock that prevents concurrent access to the state of this instance
|
||||
self.m_bShouldAlwaysBeOn = False # indicates that the machine should never go to sleep or off for whatever reason (eg simpatix10)
|
||||
self.m_pendingRequestsQueue = []
|
||||
|
||||
def getGridEngine( self ):
|
||||
def getGridEngine(self):
|
||||
return self.m_gridEngine
|
||||
|
||||
def getName( self ):
|
||||
def getName(self):
|
||||
return self.m_clusterNodeName
|
||||
|
||||
def setShouldAlwaysBeOn( self ):
|
||||
print('%s should always be on' % (self.getName()) )
|
||||
def setShouldAlwaysBeOn(self):
|
||||
print('%s should always be on' % (self.getName()))
|
||||
self.m_bShouldAlwaysBeOn = True
|
||||
|
||||
def pushRequest( self, request ):
|
||||
def pushRequest(self, request):
|
||||
self.m_stateLock.acquire()
|
||||
self.m_pendingRequestsQueue.append(request)
|
||||
self.m_stateLock.release()
|
||||
|
||||
def popRequest( self ):
|
||||
def popRequest(self):
|
||||
oldestRequest = None
|
||||
self.m_stateLock.acquire()
|
||||
if len(self.m_pendingRequestsQueue) != 0:
|
||||
|
@@ -146,14 +150,14 @@ class ClusterNodeStatusUpdater( threading.Thread ):
|
|||
self.m_stateLock.release()
|
||||
return oldestRequest
|
||||
|
||||
def run( self ):
|
||||
def run(self):
|
||||
try:
|
||||
|
||||
while not self.m_bStop :
|
||||
while not self.m_bStop:
|
||||
# handle the oldest request
|
||||
request = self.popRequest()
|
||||
if request != None :
|
||||
request.process( self )
|
||||
if request is not None:
|
||||
request.process(self)
|
||||
|
||||
# schedule a power state check if required
|
||||
currentTime = time.time()
|
||||
|
@@ -161,28 +165,28 @@ class ClusterNodeStatusUpdater( threading.Thread ):
|
|||
if not self.m_bShouldAlwaysBeOn: # don't do power checks on such machines because some current implementations of
|
||||
# operations involved might cause the machine to go to sleep
|
||||
if (not self.m_lastPowerStateCheckTime) or (currentTime > (self.m_lastPowerStateCheckTime + ClusterNodeStatusUpdater.DELAY_BETWEEN_POWERSTATE_CHECKS)):
|
||||
self.pushRequest( CheckPowerStateRequest() )
|
||||
self.pushRequest(CheckPowerStateRequest())
|
||||
|
||||
time.sleep(1)
|
||||
except BaseException, exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||
Util.onException(exception)
|
||||
except BaseException as exception: # catches all exceptions, including the ctrl+C (KeyboardInterrupt)
|
||||
onException(exception)
|
||||
|
||||
def requestSleep( self, sleepCompleteNotifier = None ):
|
||||
assert( self.m_bShouldAlwaysBeOn == False )
|
||||
self.pushRequest( SleepRequest( sleepCompleteNotifier ) )
|
||||
def requestSleep(self, sleepCompleteNotifier=None):
|
||||
assert not self.m_bShouldAlwaysBeOn
|
||||
self.pushRequest(SleepRequest(sleepCompleteNotifier))
|
||||
|
||||
def requestWakeUp( self, wakeUpNotifier = None ):
|
||||
assert( self.m_bShouldAlwaysBeOn == False )
|
||||
self.pushRequest( WakeUpRequest( wakeUpNotifier ) )
|
||||
def requestWakeUp(self, wakeUpNotifier=None):
|
||||
assert self.m_bShouldAlwaysBeOn is False
|
||||
self.pushRequest(WakeUpRequest(wakeUpNotifier))
|
||||
|
||||
def getQueueMachineName( self ):
|
||||
def getQueueMachineName(self):
|
||||
return self.m_clusterNode.getQueueMachineName()
|
||||
|
||||
def setQueueActivation( self, bEnable ):
|
||||
def setQueueActivation(self, bEnable):
|
||||
"""
|
||||
@return true on success, false otherwise
|
||||
"""
|
||||
return self.getGridEngine().setQueueInstanceActivation( self.getQueueMachineName(), bEnable )
|
||||
return self.getGridEngine().setQueueInstanceActivation(self.getQueueMachineName(), bEnable)
|
||||
|
||||
def queueIsEmpty( self ):
|
||||
return self.getGridEngine().queueIsEmpty( self.getName() )
|
||||
def queueIsEmpty(self):
|
||||
return self.getGridEngine().queueIsEmpty(self.getName())
|
||||
|
@@ -1,10 +1,13 @@
|
|||
import threading
|
||||
from JobsStateUpdater import *
|
||||
from JobsStateUpdater import JobsStateUpdater
|
||||
import Lib.Util
|
||||
import Lib.SimpaDbUtil
|
||||
from ClusterNode import *
|
||||
from ClusterNode import ClusterNode
|
||||
from Log import logInfo, logError
|
||||
from PowerState import PowerState
|
||||
import time
|
||||
|
||||
|
||||
class ClusterStatus:
|
||||
"""
|
||||
The current state (jobs, sensors) of the cluster
|
||||
|
@@ -15,39 +18,38 @@ class ClusterStatus:
|
|||
self.m_gridEngine = gridEngine
|
||||
self.m_clusterNodes = {}
|
||||
self.m_lock = threading.Lock() # to prevent concurrent access to this instance
|
||||
self.m_jobsStateUpdater = JobsStateUpdater( self )
|
||||
self.m_jobsStateUpdater = JobsStateUpdater(self)
|
||||
self.m_jobsState = None
|
||||
#self.m_controlledMachineNames = [ 'simpatix30' ]
|
||||
self.m_controlledMachineNames = [] # [ 'simpatix30' ]
|
||||
# self.m_controlledMachineNames = ['simpatix30']
|
||||
self.m_controlledMachineNames = [] # ['simpatix30']
|
||||
if False:
|
||||
for iMachine in range(11, 40):
|
||||
if (iMachine == 31) or (iMachine == 32):
|
||||
continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
|
||||
if (iMachine == 18):
|
||||
continue # this machine needs maintenance (restarting because it's very slow for an unknown reason)
|
||||
self.m_controlledMachineNames.append( 'simpatix%d' % iMachine )
|
||||
self.m_controlledMachineNames.append('simpatix%d' % iMachine)
|
||||
nodeNames = Lib.SimpaDbUtil.getClusterMachinesNames()
|
||||
for nodeName in nodeNames:
|
||||
if nodeName in self.m_controlledMachineNames:
|
||||
logInfo( 'machine %s is under the cluster controller\'s control' % nodeName )
|
||||
clusterNode = ClusterNode( nodeName, self, gridEngine )
|
||||
logInfo('machine %s is under the cluster controller\'s control' % nodeName)
|
||||
clusterNode = ClusterNode(nodeName, self, gridEngine)
|
||||
if nodeName == 'simpatix10':
|
||||
clusterNode.setShouldAlwaysBeOn()
|
||||
self.m_clusterNodes[ nodeName ] = clusterNode
|
||||
self.m_clusterNodes[nodeName] = clusterNode
|
||||
return
|
||||
|
||||
|
||||
def setControlOnMachine(self, machineName, bControl):
|
||||
if bControl:
|
||||
# add machineName under control of ClusterController
|
||||
for k, v in self.m_clusterNodes.items():
|
||||
if v.getName() == machineName :
|
||||
if v.getName() == machineName:
|
||||
return # nothing to do : machineName is already under the control of ClusterController
|
||||
|
||||
clusterNode = ClusterNode( machineName, self, self.m_gridEngine )
|
||||
clusterNode = ClusterNode(machineName, self, self.m_gridEngine)
|
||||
if machineName == 'simpatix10':
|
||||
clusterNode.setShouldAlwaysBeOn()
|
||||
self.m_clusterNodes[ machineName ] = clusterNode
|
||||
self.m_clusterNodes[machineName] = clusterNode
|
||||
clusterNode.m_machineStatusUpdater.start()
|
||||
else:
|
||||
# remove machineName from control of ClusterController
|
||||
|
@@ -57,48 +59,48 @@ class ClusterStatus:
|
|||
clusterNode.m_machineStatusUpdater.join()
|
||||
self.m_clusterNodes.pop(machineName)
|
||||
|
||||
def getGridEngine( self ):
|
||||
def getGridEngine(self):
|
||||
return self.m_gridEngine
|
||||
|
||||
def getMachines( self ):
|
||||
def getMachines(self):
|
||||
return self.m_clusterNodes
|
||||
|
||||
def startReadingThreads( self ):
|
||||
def startReadingThreads(self):
|
||||
for k, v in self.m_clusterNodes.items():
|
||||
v.m_machineStatusUpdater.start()
|
||||
self.m_jobsStateUpdater.start()
|
||||
|
||||
def stopReadingThreads( self ):
|
||||
def stopReadingThreads(self):
|
||||
for k, v in self.m_clusterNodes.items():
|
||||
v.m_machineStatusUpdater.m_bStop = True
|
||||
v.m_machineStatusUpdater.join()
|
||||
self.m_jobsStateUpdater.m_bStop = True
|
||||
self.m_jobsStateUpdater.join()
|
||||
|
||||
def onNewJobsState( self, newJobsState ):
|
||||
#logDebug( 'ClusterStatus::onNewJobsState : attempting to acquire lock to access m_jobsState' )
|
||||
def onNewJobsState(self, newJobsState):
|
||||
# logDebug('ClusterStatus::onNewJobsState : attempting to acquire lock to access m_jobsState')
|
||||
self.m_lock.acquire()
|
||||
#logDebug( 'ClusterStatus::onNewJobsState : got lock to access m_jobsState' )
|
||||
# logDebug('ClusterStatus::onNewJobsState : got lock to access m_jobsState')
|
||||
self.m_jobsState = newJobsState
|
||||
self.m_lock.release()
|
||||
|
||||
def getJobsOnMachine( self, machineName ):
|
||||
return self.m_jobsState.getJobsOnMachine( machineName )
|
||||
def getJobsOnMachine(self, machineName):
|
||||
return self.m_jobsState.getJobsOnMachine(machineName)
|
||||
|
||||
def isReady( self ):
|
||||
def isReady(self):
|
||||
for k, v in self.m_clusterNodes.items():
|
||||
if not v.isReady():
|
||||
logInfo( 'ClusterStatus::isReady : not ready because of ' + v.getName() )
|
||||
logInfo('ClusterStatus::isReady : not ready because of ' + v.getName())
|
||||
return False
|
||||
#log('ClusterStatus::isReady() : '+k+' is ready')
|
||||
#assert( False )
|
||||
if self.m_jobsState == None:
|
||||
logInfo( 'ClusterStatus::isReady : not ready because waiting for jobs state' )
|
||||
# log('ClusterStatus::isReady() : '+k+' is ready')
|
||||
# assert(False)
|
||||
if self.m_jobsState is None:
|
||||
logInfo('ClusterStatus::isReady : not ready because waiting for jobs state')
|
||||
return False
|
||||
return True
|
||||
|
||||
def getIdleMachines( self ):
|
||||
assert( self.isReady )
|
||||
def getIdleMachines(self):
|
||||
assert self.isReady
|
||||
bBUG_00000009_IS_STILL_ALIVE = True
|
||||
if bBUG_00000009_IS_STILL_ALIVE:
|
||||
currentTime = time.time()
|
||||
|
@@ -106,33 +108,33 @@ class ClusterStatus:
|
|||
fJobsStateAge = currentTime - self.m_jobsState.getTime()
|
||||
if fJobsStateAge > fJOBS_STATE_MAX_ALLOWED_AGE:
|
||||
logError('ClusterStatus::getIdleMachines : age of jobs state is too old (%f s). This is bug 00000009.' % (fJobsStateAge))
|
||||
assert( False )
|
||||
assert False
|
||||
idleMachines = {}
|
||||
for machineName, machine in self.m_clusterNodes.items():
|
||||
if machine.getPowerState() == PowerState.ON:
|
||||
jobsOnThisMachine = self.getJobsOnMachine( machineName )
|
||||
jobsOnThisMachine = self.getJobsOnMachine(machineName)
|
||||
if len(jobsOnThisMachine) == 0:
|
||||
idleMachines[ machineName ] = machine
|
||||
idleMachines[machineName] = machine
|
||||
return idleMachines
|
||||
|
||||
def getPendingJobs( self ):
|
||||
def getPendingJobs(self):
|
||||
return self.m_jobsState.getPendingJobs()
|
||||
|
||||
def getJobsState( self ):
|
||||
def getJobsState(self):
|
||||
return self.m_jobsState
|
||||
|
||||
def queueMachineFitsJobRequirements( self, queueMachine, jobRequirements ):
|
||||
def queueMachineFitsJobRequirements(self, queueMachine, jobRequirements):
|
||||
if jobRequirements.m_queues:
|
||||
bQueueIsInAllowedQueues = False
|
||||
for queueName in jobRequirements.m_queues:
|
||||
if queueName == queueMachine.getQueueName():
|
||||
bQueueIsInAllowedQueues = True
|
||||
if not bQueueIsInAllowedQueues:
|
||||
logInfo('queueMachineFitsJobRequirements : queueMachine '+queueMachine.getName()+' rejected because it\'s not in the allowed queues')
|
||||
logInfo('queueMachineFitsJobRequirements : queueMachine ' + queueMachine.getName() + ' rejected because it\'s not in the allowed queues')
|
||||
return False
|
||||
return True
|
||||
|
||||
def getEnergyConsumption( self ):
|
||||
def getEnergyConsumption(self):
|
||||
"""
|
||||
returns an estimate of the energy consumption since the start of the cluster controller (in joules)
|
||||
"""
|
||||
|
@@ -142,7 +144,7 @@ class ClusterStatus:
|
|||
fEnergyConsumption += machine.getEnergyConsumption()
|
||||
return fEnergyConsumption
|
||||
|
||||
def getEnergySavings( self ):
|
||||
def getEnergySavings(self):
|
||||
"""
|
||||
returns an estimate of the energy saving since the start of the cluster controller (in joules)
|
||||
"""
|
||||
|
@@ -152,58 +154,56 @@ class ClusterStatus:
|
|||
fEnergySavings += machine.getEnergySavings()
|
||||
return fEnergySavings
|
||||
|
||||
def getCurrentPowerConsumption( self ):
|
||||
def getCurrentPowerConsumption(self):
|
||||
fPowerConsumption = 0.0
|
||||
for machine in self.m_clusterNodes.values():
|
||||
if machine.isReady():
|
||||
fPowerConsumption += machine.getPowerConsumption()
|
||||
return fPowerConsumption
|
||||
|
||||
def getCurrentPowerSavings( self ):
|
||||
def getCurrentPowerSavings(self):
|
||||
fPowerSavings = 0.0
|
||||
for machine in self.m_clusterNodes.values():
|
||||
if machine.isReady():
|
||||
fPowerSavings += machine.getPowerConsumptionForPowerState( PowerState.ON ) - machine.getPowerConsumption()
|
||||
fPowerSavings += machine.getPowerConsumptionForPowerState(PowerState.ON) - machine.getPowerConsumption()
|
||||
return fPowerSavings
|
||||
|
||||
def getNumControlledSlots( self ):
|
||||
def getNumControlledSlots(self):
|
||||
self.m_lock.acquire()
|
||||
iNumControlledSlots = 0
|
||||
for machine in self.m_clusterNodes.values():
|
||||
queueMachine = self.m_jobsState.getQueueMachine( machine.getName() )
|
||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
||||
iNumControlledSlots += queueMachine.getNumSlots()
|
||||
self.m_lock.release()
|
||||
return iNumControlledSlots
|
||||
|
||||
def getNumUsedSlots( self ):
|
||||
def getNumUsedSlots(self):
|
||||
self.m_lock.acquire()
|
||||
iNumUsedSlots = 0
|
||||
for machine in self.m_clusterNodes.values():
|
||||
queueMachine = self.m_jobsState.getQueueMachine( machine.getName() )
|
||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
||||
iNumUsedSlotsOnThisMachine = queueMachine.getNumSlots() - self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
|
||||
assert(iNumUsedSlotsOnThisMachine >= 0)
|
||||
assert iNumUsedSlotsOnThisMachine >= 0
|
||||
iNumUsedSlots += iNumUsedSlotsOnThisMachine
|
||||
self.m_lock.release()
|
||||
return iNumUsedSlots
|
||||
|
||||
def getNumWastedSlots( self ):
|
||||
def getNumWastedSlots(self):
|
||||
self.m_lock.acquire()
|
||||
iNumWastedSlots = 0
|
||||
for machine in self.m_clusterNodes.values():
|
||||
if machine.getPowerState() == PowerState.ON:
|
||||
queueMachine = self.m_jobsState.getQueueMachine( machine.getName() )
|
||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
||||
iNumWastedSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
|
||||
self.m_lock.release()
|
||||
return iNumWastedSlots
|
||||
|
||||
def getNumSleepingSlots( self ):
|
||||
def getNumSleepingSlots(self):
|
||||
self.m_lock.acquire()
|
||||
iNumSleepingSlots = 0
|
||||
for machine in self.m_clusterNodes.values():
|
||||
if machine.getPowerState() == PowerState.SLEEP:
|
||||
queueMachine = self.m_jobsState.getQueueMachine( machine.getName() )
|
||||
queueMachine = self.m_jobsState.getQueueMachine(machine.getName())
|
||||
iNumSleepingSlots += self.m_jobsState.getNumFreeSlotsOnQueueMachine(queueMachine)
|
||||
self.m_lock.release()
|
||||
return iNumSleepingSlots
|
||||
|
||||
|
||||
|
@@ -1,19 +1,21 @@
|
|||
|
||||
class JobStateFlags:
|
||||
RUNNING=1 # the job is running
|
||||
WAITING=2 # the job is waiting
|
||||
QUEUED=4 # not sure what that exactly means but it reflects the q state of jobs as seen in the pending jobs list from qstat -f -u \*
|
||||
TRANSFERING=8
|
||||
DELETED=16
|
||||
HOLD=32
|
||||
ERROR=64
|
||||
SUSPENDED=128
|
||||
RUNNING = 1 # the job is running
|
||||
WAITING = 2 # the job is waiting
|
||||
QUEUED = 4 # not sure what that exactly means but it reflects the q state of jobs as seen in the pending jobs list from qstat -f -u \*
|
||||
TRANSFERING = 8
|
||||
DELETED = 16
|
||||
HOLD = 32
|
||||
ERROR = 64
|
||||
SUSPENDED = 128
|
||||
|
||||
|
||||
class ParallelEnvironment:
|
||||
MPI=1
|
||||
MPI = 1
|
||||
|
||||
|
||||
class JobRequirements:
|
||||
def __init__( self ):
|
||||
def __init__(self):
|
||||
self.m_numSlots = None
|
||||
self.m_strArchitecture = None # machine architecture
|
||||
self.m_parallelEnvironment = None
|
||||
|
@@ -28,13 +30,14 @@ class JobId:
|
|||
share the same sge job identifier. To uniquely define a job array element, we also use the task id.
|
||||
"""
|
||||
MAX_NUM_JOBS_IN_ARRAY = 1000000
|
||||
def __init__( self, iJobId, iJobArrayElementId = None):
|
||||
|
||||
def __init__(self, iJobId, iJobArrayElementId=None):
|
||||
if iJobArrayElementId is not None:
|
||||
assert iJobArrayElementId <= self.MAX_NUM_JOBS_IN_ARRAY
|
||||
self.m_iJobId = iJobId
|
||||
self.m_iJobArrayElementId = iJobArrayElementId # None if this identifier does not refer to a job array element
|
||||
|
||||
def __hash__( self ):
|
||||
def __hash__(self):
|
||||
"""
|
||||
required to use a JobId as a dict hash key
|
||||
"""
|
||||
|
@@ -43,7 +46,7 @@ class JobId:
|
|||
hash += self.m_iJobArrayElementId
|
||||
return hash
|
||||
|
||||
def __eq__( self, other ):
|
||||
def __eq__(self, other):
|
||||
"""
|
||||
required to use a JobId as a dict hash key
|
||||
"""
|
||||
|
@@ -53,22 +56,21 @@ class JobId:
|
|||
return False
|
||||
return True
|
||||
|
||||
def isJobArrayElement( self ):
|
||||
return (self.m_iJobArrayElementId != None)
|
||||
def isJobArrayElement(self):
|
||||
return (self.m_iJobArrayElementId is not None)
|
||||
|
||||
def getMainId(self):
|
||||
return self.m_iJobId
|
||||
|
||||
def asStr( self ):
|
||||
def asStr(self):
|
||||
strResult = '%s' % self.m_iJobId
|
||||
if self.isJobArrayElement():
|
||||
strResult += '.%d' % self.m_iJobArrayElementId
|
||||
return strResult
|
||||
|
||||
|
||||
|
||||
class Job:
|
||||
def __init__( self, jobId ):
|
||||
def __init__(self, jobId):
|
||||
self.m_jobId = jobId
|
||||
self.m_startTime = None
|
||||
self.m_submitTime = None
|
||||
|
@@ -78,53 +80,67 @@ class Job:
|
|||
self.m_stateFlags = 0
|
||||
self.m_jobRequirements = JobRequirements()
|
||||
self.m_requestedRamPerCore = 0
|
||||
def getId( self ):
|
||||
|
||||
def getId(self):
|
||||
return self.m_jobId
|
||||
def setState( self, state ):
|
||||
|
||||
def setState(self, state):
|
||||
self.m_stateFlags = state
|
||||
def setOwner( self, jobOwner ):
|
||||
|
||||
def setOwner(self, jobOwner):
|
||||
if self.m_owner:
|
||||
assert( self.m_owner == jobOwner )
|
||||
assert self.m_owner == jobOwner
|
||||
self.m_owner = jobOwner
|
||||
def getOwner( self ):
|
||||
|
||||
def getOwner(self):
|
||||
return self.m_owner
|
||||
def setStartTime( self, jobStartTime ):
|
||||
|
||||
def setStartTime(self, jobStartTime):
|
||||
if self.m_startTime:
|
||||
assert( self.m_startTime == jobStartTime )
|
||||
assert self.m_startTime == jobStartTime
|
||||
self.m_startTime = jobStartTime
|
||||
def setSubmitTime( self, jobSubmitTime ):
|
||||
|
||||
def setSubmitTime(self, jobSubmitTime):
|
||||
if self.m_submitTime:
|
||||
assert( self.m_submitTime == jobSubmitTime )
|
||||
assert self.m_submitTime == jobSubmitTime
|
||||
self.m_submitTime = jobSubmitTime
|
||||
def getStartTime( self ):
|
||||
|
||||
def getStartTime(self):
|
||||
return self.m_startTime
|
||||
def setScriptName( self, jobScriptName ):
|
||||
|
||||
def setScriptName(self, jobScriptName):
|
||||
if self.m_scriptName:
|
||||
assert( self.m_scriptName == jobScriptName )
|
||||
assert self.m_scriptName == jobScriptName
|
||||
self.m_scriptName = jobScriptName
|
||||
def addSlots( self, queueMachineName, numSlots ):
|
||||
assert( self.m_slots.get( queueMachineName ) == None )
|
||||
if self.m_slots.get( queueMachineName ) == None:
|
||||
self.m_slots[ queueMachineName ] = numSlots
|
||||
|
||||
def addSlots(self, queueMachineName, numSlots):
|
||||
assert self.m_slots.get(queueMachineName) is None
|
||||
if self.m_slots.get(queueMachineName) is None:
|
||||
self.m_slots[queueMachineName] = numSlots
|
||||
else:
|
||||
# should never happen
|
||||
self.m_slots[ queueMachineName ] += numSlots
|
||||
def getSlots( self ):
|
||||
self.m_slots[queueMachineName] += numSlots
|
||||
|
||||
def getSlots(self):
|
||||
return self.m_slots
|
||||
def setNumRequiredSlots( self, numSlots ):
|
||||
|
||||
def setNumRequiredSlots(self, numSlots):
|
||||
self.m_jobRequirements.m_numSlots = numSlots
|
||||
def isPending( self ):
|
||||
|
||||
def isPending(self):
|
||||
"""
|
||||
returns true if this job is waiting in the queue for whatever reason
|
||||
"""
|
||||
return self.m_stateFlags & JobStateFlags.QUEUED
|
||||
def getRequestedRamPerCore( self ):
|
||||
|
||||
def getRequestedRamPerCore(self):
|
||||
"""
|
||||
requested RAM per core in bytes
|
||||
"""
|
||||
return self.m_requestedRamPerCore
|
||||
def setRequestedRamPerCore( self, requestedRam ):
|
||||
|
||||
def setRequestedRamPerCore(self, requestedRam):
|
||||
"""
|
||||
requestedRam : requested RAM per core in bytes
|
||||
"""
|
||||
self.m_requestedRamPerCore=requestedRam
|
||||
self.m_requestedRamPerCore = requestedRam
|
||||
|
@@ -1,85 +1,86 @@
|
|||
from .Log import *
|
||||
|
||||
|
||||
class JobsState:
|
||||
"""
|
||||
represents a snapshot of the state of SGE jobs as seen by the SGE command "qstat -f -u \*"
|
||||
"""
|
||||
def __init__( self ):
|
||||
def __init__(self):
|
||||
self.m_jobs = {} # list of jobs
|
||||
self.m_jobArrayJobs = {} # a dictionary of jobs for each job array, indexed by job array id
|
||||
self.m_queueMachines = {} # list of queue machines such as allintel.q@simpatix10
|
||||
self.m_stateTime = None # the time at which the state was snapshot
|
||||
|
||||
def deleteAllJobs( self ):
|
||||
def deleteAllJobs(self):
|
||||
self.m_jobs = {}
|
||||
self.m_jobArrayJobs = {}
|
||||
|
||||
def addJob( self, job ):
|
||||
def addJob(self, job):
|
||||
jobId = job.getId()
|
||||
self.m_jobs[ jobId ] = job
|
||||
self.m_jobs[jobId] = job
|
||||
if jobId.isJobArrayElement():
|
||||
tasks = self.m_jobArrayJobs.get(jobId.m_iJobId)
|
||||
if tasks == None:
|
||||
if tasks is None:
|
||||
tasks = {}
|
||||
self.m_jobArrayJobs[ jobId.m_iJobId ] = tasks
|
||||
self.m_jobArrayJobs[jobId.m_iJobId] = tasks
|
||||
tasks[jobId] = job
|
||||
|
||||
def getJob( self, jobId ):
|
||||
return self.m_jobs.get( jobId )
|
||||
def getJob(self, jobId):
|
||||
return self.m_jobs.get(jobId)
|
||||
|
||||
def getJobArrayJobs( self, iJobArrayId ):
|
||||
return self.m_jobArrayJobs.get( iJobArrayId )
|
||||
def getJobArrayJobs(self, iJobArrayId):
|
||||
return self.m_jobArrayJobs.get(iJobArrayId)
|
||||
|
||||
def setTime( self, stateTime ):
|
||||
def setTime(self, stateTime):
|
||||
self.m_stateTime = stateTime
|
||||
|
||||
def getTime( self ):
|
||||
def getTime(self):
|
||||
return self.m_stateTime
|
||||
|
||||
def getJobsOnMachine( self, machineName ):
|
||||
def getJobsOnMachine(self, machineName):
|
||||
jobsOnMachine = {}
|
||||
for jobId, job in self.m_jobs.items():
|
||||
for queueMachineName, numSlots in job.getSlots().items():
|
||||
jobMachineName = queueMachineName.split('@')[1]
|
||||
if jobMachineName == machineName:
|
||||
jobsOnMachine[ jobId ] = job
|
||||
jobsOnMachine[jobId] = job
|
||||
return jobsOnMachine
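As a side note, the lookup above relies on the '<queueName>@<machineName>' naming convention (e.g. allintel.q@simpatix10, as in the comments above); a minimal standalone illustration:

queue_machine_name = 'allintel.q@simpatix10'  # sample name, same convention as in the comments above
machine_name = queue_machine_name.split('@')[1]
assert machine_name == 'simpatix10'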
|
||||
|
||||
def getNumFreeSlotsOnQueueMachine( self, queueMachine ):
|
||||
#logInfo('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.getName() )
|
||||
def getNumFreeSlotsOnQueueMachine(self, queueMachine):
|
||||
# logInfo('getNumFreeSlotsOnQueueMachine : looking for free slots on queuemachine %s' % queueMachine.getName())
|
||||
numUsedSlots = 0
|
||||
for job in self.m_jobs.values():
|
||||
numUsedSlotsByThisJob = job.getSlots().get( queueMachine.getName() )
|
||||
if numUsedSlotsByThisJob != None:
|
||||
#logInfo('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob) )
|
||||
numUsedSlotsByThisJob = job.getSlots().get(queueMachine.getName())
|
||||
if numUsedSlotsByThisJob is not None:
|
||||
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses %d slots' % (job.getId().asStr(), numUsedSlotsByThisJob))
|
||||
numUsedSlots += numUsedSlotsByThisJob
|
||||
else:
|
||||
None
|
||||
#logInfo('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr() )
|
||||
# logInfo('getNumFreeSlotsOnQueueMachine : job %d uses no slot' % job.getId().asStr())
|
||||
numFreeSlots = queueMachine.getNumSlots() - numUsedSlots
|
||||
assert( numFreeSlots >= 0 )
|
||||
assert numFreeSlots >= 0
|
||||
return numFreeSlots
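For clarity, a small worked example of the computation above, using plain numbers instead of the Job/QueueMachine objects (the figures are made up for illustration):

total_slots = 8                                # what queueMachine.getNumSlots() would return
slots_used_per_job = {'job_1': 2, 'job_2': 3}  # what job.getSlots().get(queueMachine.getName()) yields per job
num_used_slots = sum(slots_used_per_job.values())
num_free_slots = total_slots - num_used_slots
assert num_free_slots == 3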
|
||||
|
||||
def addQueueMachine( self, queueMachine ):
|
||||
self.m_queueMachines[ queueMachine.getName() ] = queueMachine
|
||||
def addQueueMachine(self, queueMachine):
|
||||
self.m_queueMachines[queueMachine.getName()] = queueMachine
|
||||
|
||||
def getQueueMachine( self, machineName ):
|
||||
def getQueueMachine(self, machineName):
|
||||
"""
|
||||
finds the queue machine associated with a machine
|
||||
"""
|
||||
queueMachine = None
|
||||
for qmName, qm in self.m_queueMachines.items():
|
||||
if qm.m_machineName == machineName:
|
||||
assert( queueMachine == None ) # to be sure that no more than one queue machine is on a given machine
|
||||
assert queueMachine is None # to be sure that no more than one queue machine is on a given machine
|
||||
queueMachine = qm
|
||||
return queueMachine
|
||||
|
||||
def getQueueMachines( self ):
|
||||
def getQueueMachines(self):
|
||||
return self.m_queueMachines
|
||||
|
||||
def getPendingJobs( self ):
|
||||
def getPendingJobs(self):
|
||||
pendingJobs = {}
|
||||
for jobId, job in self.m_jobs.items():
|
||||
if job.isPending():
|
||||
pendingJobs[ job.getId() ] = job
|
||||
pendingJobs[job.getId()] = job
|
||||
return pendingJobs
|
||||
|
|
|
@ -1,29 +1,33 @@
|
|||
import time
|
||||
import threading
|
||||
|
||||
gLogFilePath = '/tmp/ClusterController.log'#'/var/log/ClusterController.log'
|
||||
gLogFilePath = '/tmp/ClusterController.log' # '/var/log/ClusterController.log'
|
||||
|
||||
def log( message ):
|
||||
|
||||
def log(message):
|
||||
threadName = threading.currentThread().getName()
|
||||
logMessage = time.asctime(time.localtime())+' : '+ threadName + ' : ' + message
|
||||
logMessage = time.asctime(time.localtime()) + ' : ' + threadName + ' : ' + message
|
||||
print(logMessage)
|
||||
f = open(gLogFilePath, 'a+')
|
||||
assert( f )
|
||||
assert f
|
||||
try:
|
||||
f.write( logMessage + '\n' )
|
||||
f.write(logMessage + '\n')
|
||||
finally:
|
||||
f.close()
|
||||
|
||||
def logDebug( message ):
|
||||
log('[D]'+message)
|
||||
|
||||
def logDebug(message):
|
||||
log('[D]' + message)
|
||||
return
|
||||
|
||||
def logInfo( message ):
|
||||
log('[I]'+message)
|
||||
|
||||
def logWarning( message ):
|
||||
log('[W]'+message)
|
||||
def logInfo(message):
|
||||
log('[I]' + message)
|
||||
|
||||
def logError( message ):
|
||||
log('[E]'+message)
|
||||
|
||||
def logWarning(message):
|
||||
log('[W]' + message)
|
||||
|
||||
|
||||
def logError(message):
|
||||
log('[E]' + message)
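As a usage note for the helpers above (the sample timestamp and thread name below are only illustrative, not taken from a real run):

logInfo('cluster controller started')
# appends a line of this shape to gLogFilePath:
#   Mon Jan  1 12:00:00 2024 : MainThread : [I]cluster controller started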
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
|
||||
class PowerState:
|
||||
UNKNOWN=0
|
||||
OFF=1
|
||||
ON=2
|
||||
SLEEP=3
|
||||
UNPLUGGED=4
|
||||
UNKNOWN = 0
|
||||
OFF = 1
|
||||
ON = 2
|
||||
SLEEP = 3
|
||||
UNPLUGGED = 4
|
||||
|
||||
def PowerStateToStr( powerState ):
|
||||
|
||||
def PowerStateToStr(powerState):
|
||||
if powerState == PowerState.UNKNOWN:
|
||||
return 'UNKNOWN'
|
||||
if powerState == PowerState.OFF:
|
||||
|
@ -18,4 +19,4 @@ def PowerStateToStr( powerState ):
|
|||
if powerState == PowerState.UNPLUGGED:
|
||||
return 'UNPLUGGED'
|
||||
else:
|
||||
assert( False )
|
||||
assert False
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
import io
|
||||
import re
|
||||
from .JobsState import *
|
||||
from .QueueMachine import *
|
||||
from .JobsState import JobsState
|
||||
from .QueueMachine import QueueMachine, QueueMachineStateFlags
|
||||
from .Util import *
|
||||
from .Log import *
|
||||
from .Job import *
|
||||
from .Log import logError
|
||||
from .Job import JobStateFlags, JobId, Job, ParallelEnvironment
|
||||
import logging
|
||||
|
||||
|
||||
class QstatParser:
|
||||
def parseJobState( self, strJobStatus ):
|
||||
def parseJobState(self, strJobStatus):
|
||||
jobState = 0
|
||||
for i in range(0, len(strJobStatus) ):
|
||||
for i in range(0, len(strJobStatus)):
|
||||
c = strJobStatus[i]
|
||||
if c == 'r':
|
||||
jobState += JobStateFlags.RUNNING
|
||||
|
@ -30,9 +32,10 @@ class QstatParser:
|
|||
else:
|
||||
assert False, 'unhandled job state flag :"' + c + '"'
|
||||
return jobState
|
||||
def parseQueueMachineState( self, strQueueMachineStatus ):
|
||||
|
||||
def parseQueueMachineState(self, strQueueMachineStatus):
|
||||
queueMachineState = 0
|
||||
for i in range(0, len(strQueueMachineStatus) ):
|
||||
for i in range(0, len(strQueueMachineStatus)):
|
||||
c = strQueueMachineStatus[i]
|
||||
if c == 'd':
|
||||
queueMachineState += QueueMachineStateFlags.DISABLED
|
||||
|
@ -49,7 +52,8 @@ class QstatParser:
|
|||
else:
|
||||
assert False, 'unhandled queue machine state flag :"' + c + '"'
|
||||
return queueMachineState
|
||||
def parseQstatOutput( self, qstatOutput ):
|
||||
|
||||
def parseQstatOutput(self, qstatOutput):
|
||||
"""
|
||||
parses result of command 'qstat -f -u \* -pri'
|
||||
"""
|
||||
|
@ -69,30 +73,29 @@ class QstatParser:
|
|||
singleIndexMatch = re.match('^(?P<elementIndex>[0-9]+)$', strRange)
|
||||
if singleIndexMatch:
|
||||
iElementIndex = int(singleIndexMatch.group('elementIndex'))
|
||||
task_ids.extend(range(iElementIndex, iElementIndex+1))
|
||||
task_ids.extend(range(iElementIndex, iElementIndex + 1))
|
||||
else:
|
||||
# we expect strRange to be of the form "1-4:1", where :
|
||||
# the 1st number is the min element index (sge imposes it to be greater than 0)
|
||||
# the 2nd number is the max element index
|
||||
# the 3rd number is the step between consecutive element indices
|
||||
rangeMatch = re.match( '^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange)
|
||||
if rangeMatch == None:
|
||||
logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line) )
|
||||
assert(False)
|
||||
iMinElementIndex=int(rangeMatch.group('minElementIndex'))
|
||||
iMaxElementIndex=int(rangeMatch.group('maxElementIndex'))
|
||||
iStepBetweenIndices=int(rangeMatch.group('stepBetweenIndices'))
|
||||
task_ids.extend(range(iMinElementIndex, iMaxElementIndex+1, iStepBetweenIndices))
|
||||
rangeMatch = re.match('^(?P<minElementIndex>[0-9]+)-(?P<maxElementIndex>[0-9]+):(?P<stepBetweenIndices>[0-9]+)$', strRange)
|
||||
if rangeMatch is None:
|
||||
logError('unexpected format for job array details : "%s" (line="%s"' % (strRange, line))
|
||||
assert False
|
||||
iMinElementIndex = int(rangeMatch.group('minElementIndex'))
|
||||
iMaxElementIndex = int(rangeMatch.group('maxElementIndex'))
|
||||
iStepBetweenIndices = int(rangeMatch.group('stepBetweenIndices'))
|
||||
task_ids.extend(range(iMinElementIndex, iMaxElementIndex + 1, iStepBetweenIndices))
|
||||
return task_ids
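A standalone sketch of the same expansion, assuming the 'min-max:step' convention described in the comment above; the helper name is hypothetical and only meant for illustration:

import re

def expand_sge_task_range(str_range):
    # single index, e.g. '7'
    single = re.match(r'^(?P<index>[0-9]+)$', str_range)
    if single:
        return [int(single.group('index'))]
    # range of the form 'min-max:step', e.g. '1-4:1'
    m = re.match(r'^(?P<lo>[0-9]+)-(?P<hi>[0-9]+):(?P<step>[0-9]+)$', str_range)
    assert m is not None, 'unexpected job array range: %s' % str_range
    return list(range(int(m.group('lo')), int(m.group('hi')) + 1, int(m.group('step'))))

assert expand_sge_task_range('1-4:1') == [1, 2, 3, 4]
assert expand_sge_task_range('7') == [7]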
|
||||
|
||||
|
||||
# ugly hack to work around the fact that qstat truncates the fqdn of cluster nodes
|
||||
# graffy@physix-master:~$ qstat -f -u \*
|
||||
# queuename qtype resv/used/tot. load_avg arch states
|
||||
# ---------------------------------------------------------------------------------
|
||||
# main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64
|
||||
# TODO: fix this properly by parsing the output of 'qstat -f -u \* -xml' instead of 'qstat -f -u \*'
|
||||
qstatOutput = re.sub('\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput)
|
||||
qstatOutput = re.sub(r'\.univ[^ ]*', '.univ-rennes1.fr', qstatOutput)
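# for example, the truncated sample line shown above,
#   'main.q@physix88.ipr.univ-renne BIP 0/0/36 14.03 lx-amd64'
# becomes, after this substitution,
#   'main.q@physix88.ipr.univ-rennes1.fr BIP 0/0/36 14.03 lx-amd64'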
|
||||
|
||||
jobsState = JobsState()
|
||||
f = io.StringIO(qstatOutput)
|
||||
|
@ -113,95 +116,98 @@ class QstatParser:
|
|||
# ntckts The job's ticket amount in normalized fashion.
|
||||
# ppri The job's -p priority as specified by the user.
|
||||
|
||||
jobRegularExp = re.compile( '^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$' )
|
||||
jobRegularExp = re.compile(r'^[ ]*(?P<jobId>[^ ]+)[ ]+(?P<JobPriority>[0-9.]+)[ ]+(?P<nurg>[0-9.]+)[ ]+(?P<npprior>[0-9.]+)[ ]+(?P<ntckts>[0-9.]+)[ ]+(?P<ppri>-?[0-9]+)[ ]+(?P<jobScriptName>[^ ]+)[ ]+(?P<jobOwner>[^ ]+)[ ]+(?P<jobStatus>[^ ]+)[ ]+(?P<jobStartOrSubmitTime>[0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9])[ ]+(?P<numSlots>[0-9]+)[ ]+(?P<jobArrayDetails>[^\n]*)[\s]*$')
|
||||
# example of machine line :
|
||||
# allintel.q@simpatix34.univ-ren BIP 0/6/8 6.00 darwin-x86
|
||||
machineRegularExp = re.compile( '^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)' )
|
||||
pendingJobsHeaderRegularExp = re.compile( '^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*' )
|
||||
while( len(line) > 0 ):
|
||||
machineRegularExp = re.compile(r'^(?P<queueName>[^@]+)@(?P<machineName>[^ ]+)[ ]+(?P<queueTypeString>[^ ]+)[ ]+(?P<numReservedSlots>[^/]+)/(?P<numUsedSlots>[^/]+)/(?P<numTotalSlots>[^ ]+)[ ]+(?P<cpuLoad>[^ ]+)[\s]+(?P<archName>[^ ]+)[\s]+(?P<queueMachineStatus>[^\s]*)')
|
||||
pendingJobsHeaderRegularExp = re.compile('^ - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS - PENDING JOBS[?]*')
|
||||
while len(line) > 0:
|
||||
# print line
|
||||
# check if the current line is a line describing a job running on a machine
|
||||
matchObj = jobRegularExp.match( line )
|
||||
matchObj = jobRegularExp.match(line)
|
||||
if matchObj:
|
||||
# we are dealing with a job line
|
||||
if not bInPendingJobsSection:
|
||||
assert( currentQueueMachine )
|
||||
#log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
|
||||
assert currentQueueMachine
|
||||
# log('QstatParser::parseQstatOutput : jobId = "'+matchObj.group('jobId')+'"')
|
||||
iJobId = int(matchObj.group('jobId'))
|
||||
jobState = self.parseJobState( matchObj.group('jobStatus') )
|
||||
logging.debug('iJobId = %d' % iJobId)
|
||||
jobState = self.parseJobState(matchObj.group('jobStatus'))
|
||||
strJobArrayDetails = matchObj.group('jobArrayDetails')
|
||||
bIsJobArray = (len(strJobArrayDetails) != 0)
|
||||
#logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
|
||||
# logDebug('strJobArrayDetails = "%s", bIsJobArray=%d' % (strJobArrayDetails, int(bIsJobArray)))
|
||||
# each element of a job array is treated as a separate job for the sake of simplicity.
|
||||
# For these elements, the job id in the SGE sense is the same, but they are treated as distinct jobs in this program's sense
|
||||
task_ids = range(0,1) # just one element, unless it's a job array
|
||||
task_ids = range(0, 1) # just one element, unless it's a job array
|
||||
if bIsJobArray:
|
||||
if bInPendingJobsSection:
|
||||
task_ids = parse_pending_tasks(strJobArrayDetails)
|
||||
else:
|
||||
# we are in the running jobs section, and here we expect the strJobArrayDetails to just contain the index of the job array element
|
||||
iJobArrayElementIndex = int(strJobArrayDetails)
|
||||
assert(iJobArrayElementIndex != 0) # sge does not allow element indices to be 0
|
||||
task_ids = range(iJobArrayElementIndex,iJobArrayElementIndex+1)
|
||||
assert iJobArrayElementIndex != 0 # sge does not allow element indices to be 0
|
||||
task_ids = range(iJobArrayElementIndex, iJobArrayElementIndex + 1)
|
||||
logging.debug('task_ids = %s' % task_ids)
|
||||
for task_id in task_ids:
|
||||
logging.debug('task_id = %s' % task_id)
|
||||
jobId = None
|
||||
if bIsJobArray:
|
||||
jobId = JobId(iJobId, task_id)
|
||||
else:
|
||||
jobId = JobId(iJobId)
|
||||
job = jobsState.getJob(jobId)
|
||||
#logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr()))
|
||||
if job == None:
|
||||
# logDebug('iElementIndex = %d job id = %s' % (iElementIndex, jobId.asStr()))
|
||||
if job is None:
|
||||
# this job hasn't been encountered yet in the output of qstat ...
|
||||
# we could either be in the pending jobs section or in the running jobs section
|
||||
job = Job(jobId)
|
||||
jobsState.addJob( job )
|
||||
job.setState( jobState )
|
||||
jobsState.addJob(job)
|
||||
job.setState(jobState)
|
||||
strJobStartOrSubmitTime = matchObj.group('jobStartOrSubmitTime')
|
||||
jobStartOrSubmitTime = time.strptime(strJobStartOrSubmitTime, '%m/%d/%Y %H:%M:%S')
|
||||
if bInPendingJobsSection:
|
||||
job.setSubmitTime( jobStartOrSubmitTime )
|
||||
job.setSubmitTime(jobStartOrSubmitTime)
|
||||
else:
|
||||
job.setStartTime( jobStartOrSubmitTime )
|
||||
job.setOwner( matchObj.group('jobOwner') )
|
||||
job.setScriptName( matchObj.group('jobScriptName') )
|
||||
job.setStartTime(jobStartOrSubmitTime)
|
||||
job.setOwner(matchObj.group('jobOwner'))
|
||||
job.setScriptName(matchObj.group('jobScriptName'))
|
||||
if bInPendingJobsSection:
|
||||
job.setNumRequiredSlots(int(matchObj.group('numSlots')))
|
||||
else:
|
||||
assert( not bInPendingJobsSection ) # if we are in the pending jobs section, the job should be new
|
||||
assert not bInPendingJobsSection # if we are in the pending jobs section, the job should be new
|
||||
if not bInPendingJobsSection:
|
||||
job.addSlots( currentQueueMachine.getName(), int(matchObj.group('numSlots')) )
|
||||
job.addSlots(currentQueueMachine.getName(), int(matchObj.group('numSlots')))
|
||||
else:
|
||||
# the current line does not describe a job
|
||||
if not bInPendingJobsSection:
|
||||
# check if this line describes the status of a machine
|
||||
matchObj = machineRegularExp.match( line )
|
||||
matchObj = machineRegularExp.match(line)
|
||||
if matchObj:
|
||||
queueName = matchObj.group('queueName')
|
||||
machineName = matchObj.group('machineName')
|
||||
queueMachine = QueueMachine( queueName, machineName )
|
||||
#log(line)
|
||||
#log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
|
||||
#log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
|
||||
queueMachine.setNumSlots( int( matchObj.group('numTotalSlots') ) )
|
||||
queueMachine.setNumUsedSlots( int( matchObj.group('numUsedSlots') ) )
|
||||
queueMachine = QueueMachine(queueName, machineName)
|
||||
# log(line)
|
||||
# log('matchObj.group(queueTypeString) :' + matchObj.group('queueTypeString'))
|
||||
# log('matchObj.group(numTotalSlots) :' + matchObj.group('numTotalSlots'))
|
||||
queueMachine.setNumSlots(int(matchObj.group('numTotalSlots')))
|
||||
queueMachine.setNumUsedSlots(int(matchObj.group('numUsedSlots')))
|
||||
strCpuLoad = matchObj.group('cpuLoad')
|
||||
if strCpuLoad != '-NA-':
|
||||
queueMachine.setCpuLoad( float(strCpuLoad) )
|
||||
queueMachine.setCpuLoad(float(strCpuLoad))
|
||||
|
||||
strQueueMachineState = matchObj.group('queueMachineStatus')
|
||||
queueMachine.setState( self.parseQueueMachineState( strQueueMachineState ) )
|
||||
#log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
|
||||
#log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
|
||||
queueMachine.setState(self.parseQueueMachineState(strQueueMachineState))
|
||||
# log('QstatParser::parseQstatOutput : queueName = "'+matchObj.group('queueName')+'"')
|
||||
# log('QstatParser::parseQstatOutput : machineName = "'+matchObj.group('machineName')+'"')
|
||||
currentQueueMachine = queueMachine
|
||||
jobsState.addQueueMachine( queueMachine )
|
||||
jobsState.addQueueMachine(queueMachine)
|
||||
else:
|
||||
matchObj = pendingJobsHeaderRegularExp.match( line )
|
||||
matchObj = pendingJobsHeaderRegularExp.match(line)
|
||||
if matchObj:
|
||||
bInPendingJobsSection = True
|
||||
currentQueueMachine = None
|
||||
else:
|
||||
#print line
|
||||
# print line
|
||||
None
|
||||
else:
|
||||
# we are in a pending jobs section
|
||||
|
@ -209,30 +215,31 @@ class QstatParser:
|
|||
if not matchObj:
|
||||
# unexpected line
|
||||
print('line = "' + line + '"')
|
||||
assert( False )
|
||||
assert False
|
||||
None
|
||||
line = f.readline()
|
||||
f.close()
|
||||
return jobsState
|
||||
def parseJobDetails( self, qstatOutput, job ):
|
||||
|
||||
def parseJobDetails(self, qstatOutput, job):
|
||||
"""
|
||||
adds to job the details parsed from the output of the "qstat -j <jobid>" command
|
||||
"""
|
||||
f = io.StringIO(qstatOutput)
|
||||
line = f.readline()
|
||||
fieldRegularExp = re.compile( '^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$' )
|
||||
while( len(line) > 0 ):
|
||||
fieldRegularExp = re.compile('^(?P<fieldName>[^:]+):[ ]+(?P<fieldValue>[?]*)$')
|
||||
while len(line) > 0:
|
||||
# print line
|
||||
# check if the current line is a line describing a job running on a machine
|
||||
matchObj = fieldRegularExp.match( line )
|
||||
matchObj = fieldRegularExp.match(line)
|
||||
if matchObj:
|
||||
fieldName = matchObj.group('fieldName')
|
||||
strFieldValue = matchObj.group('fieldValue')
|
||||
if fieldName == 'job_number':
|
||||
assert( job.getId().asStr() == strFieldValue )
|
||||
assert job.getId().asStr() == strFieldValue
|
||||
elif fieldName == 'hard_queue_list':
|
||||
allowedQueues = strFieldValue.split(',')
|
||||
assert(len(allowedQueues) > 0)
|
||||
assert len(allowedQueues) > 0
|
||||
job.m_jobRequirements.m_queues = allowedQueues
|
||||
elif fieldName == 'parallel environment':
|
||||
# the value could be 'ompi range: 32'
|
||||
|
@ -240,10 +247,9 @@ class QstatParser:
|
|||
if matchObj:
|
||||
job.m_jobRequirements.m_parallelEnvironment = ParallelEnvironment.MPI
|
||||
else:
|
||||
assert( False )
|
||||
assert False
|
||||
else:
|
||||
# ignore the other fields
|
||||
None
|
||||
line = f.readline()
|
||||
f.close()
|
||||
|
|
@ -1,17 +1,18 @@
|
|||
|
||||
class QueueMachineStateFlags:
|
||||
DISABLED=1 # the queue machine is disabled
|
||||
ALARM=2 # the queue machine is in alarm state (see man qstat)
|
||||
UNKNOWN=4 # the queue machine is in unknown state because sge_execd cannot be contected (see man qstat)
|
||||
ERROR=8 # the queue is in error state
|
||||
OBSOLETE=16 # the queue no longer exists but it is still visible because it still contains running jobs
|
||||
SUSPENDED=32 # the queue machine is suspended
|
||||
DISABLED = 1 # the queue machine is disabled
|
||||
ALARM = 2 # the queue machine is in alarm state (see man qstat)
|
||||
UNKNOWN = 4 # the queue machine is in unknown state because sge_execd cannot be contacted (see man qstat)
|
||||
ERROR = 8 # the queue is in error state
|
||||
OBSOLETE = 16 # the queue no longer exists but it is still visible because it still contains running jobs
|
||||
SUSPENDED = 32 # the queue machine is suspended
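A small sketch of how these bit flags are meant to combine and be tested (the same pattern the isDisabled/isInAlarmState accessors below rely on):

state = QueueMachineStateFlags.DISABLED | QueueMachineStateFlags.ALARM  # a machine both disabled and in alarm
assert state & QueueMachineStateFlags.ALARM             # non-zero, so the alarm flag is set
assert not (state & QueueMachineStateFlags.SUSPENDED)   # the suspended flag is not set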
|
||||
|
||||
|
||||
class QueueMachine:
|
||||
"""
|
||||
a QueueMachine instance represents a given SGE queue on a given machine (eg allintel.q@simpatix10)
|
||||
"""
|
||||
def __init__( self, queueName, machineName ):
|
||||
def __init__(self, queueName, machineName):
|
||||
self.m_queueName = queueName
|
||||
self.m_machineName = machineName
|
||||
self.m_numSlots = None
|
||||
|
@ -19,47 +20,62 @@ class QueueMachine:
|
|||
self.m_fCpuLoad = None
|
||||
self.m_stateFlags = 0
|
||||
self.m_strDisableMessage = ''
|
||||
def getName( self ):
|
||||
|
||||
def getName(self):
|
||||
"""
|
||||
returns the name of the machine queue (such as allintel.q@simpatix10)
|
||||
"""
|
||||
return self.m_queueName + '@' + self.m_machineName
|
||||
|
||||
def getQueueName( self ):
|
||||
def getQueueName(self):
|
||||
return self.m_queueName
|
||||
def getMachineName( self ):
|
||||
|
||||
def getMachineName(self):
|
||||
return self.m_machineName
|
||||
def setNumSlots( self, numSlots ):
|
||||
|
||||
def setNumSlots(self, numSlots):
|
||||
self.m_numSlots = numSlots
|
||||
def setNumUsedSlots( self, numSlots ):
|
||||
|
||||
def setNumUsedSlots(self, numSlots):
|
||||
self.m_numUsedSlots = numSlots
|
||||
def getNumSlots( self ):
|
||||
assert( self.m_numSlots != None )
|
||||
|
||||
def getNumSlots(self):
|
||||
assert self.m_numSlots is not None
|
||||
return self.m_numSlots
|
||||
def getNumUsedSlots( self ):
|
||||
assert( self.m_numUsedSlots != None )
|
||||
|
||||
def getNumUsedSlots(self):
|
||||
assert self.m_numUsedSlots is not None
|
||||
return self.m_numUsedSlots
|
||||
def setCpuLoad( self, fCpuLoad ):
|
||||
|
||||
def setCpuLoad(self, fCpuLoad):
|
||||
self.m_fCpuLoad = fCpuLoad
|
||||
def cpuLoadIsAvailable( self ):
|
||||
return self.m_fCpuLoad != None
|
||||
def getCpuLoad( self ):
|
||||
assert( self.m_fCpuLoad != None )
|
||||
|
||||
def cpuLoadIsAvailable(self):
|
||||
return self.m_fCpuLoad is not None
|
||||
|
||||
def getCpuLoad(self):
|
||||
assert self.m_fCpuLoad is not None
|
||||
return self.m_fCpuLoad
|
||||
def setState( self, state ):
|
||||
|
||||
def setState(self, state):
|
||||
self.m_stateFlags = state
|
||||
def isDisabled( self ):
|
||||
|
||||
def isDisabled(self):
|
||||
return self.m_stateFlags & QueueMachineStateFlags.DISABLED
|
||||
def isInErrorState( self ):
|
||||
|
||||
def isInErrorState(self):
|
||||
return self.m_stateFlags & QueueMachineStateFlags.ERROR
|
||||
def isResponding( self ):
|
||||
|
||||
def isResponding(self):
|
||||
return not (self.m_stateFlags & QueueMachineStateFlags.UNKNOWN)
|
||||
def isInAlarmState( self ):
|
||||
|
||||
def isInAlarmState(self):
|
||||
return self.m_stateFlags & QueueMachineStateFlags.ALARM
|
||||
def isSuspended( self ):
|
||||
|
||||
def isSuspended(self):
|
||||
return self.m_stateFlags & QueueMachineStateFlags.SUSPENDED
|
||||
"""
|
||||
def getStateAsString( self ):
|
||||
assert( self.m_strState != None )
|
||||
def getStateAsString(self):
|
||||
assert(self.m_strState is not None)
|
||||
return self.m_strState
|
||||
"""
|
|
@ -1,80 +1,85 @@
|
|||
from PowerState import *
|
||||
from Log import *
|
||||
from PowerState import PowerState
|
||||
from Log import logInfo
|
||||
import time
|
||||
import copy
|
||||
|
||||
|
||||
class Slot:
|
||||
def __init__( self ):
|
||||
def __init__(self):
|
||||
self.m_queueMachine = None
|
||||
self.m_numSlots = None
|
||||
self.m_job = None # job for which this slot is allocated
|
||||
|
||||
|
||||
class SlotAllocator:
|
||||
"""
|
||||
a class that defines a strategy for allocating free slots for the given pending jobs
|
||||
"""
|
||||
def getMachinesThatNeedWakeUp( self, pendingJobs, clusterState ):
|
||||
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||
"""
|
||||
returns the list of machines that need to wake up to make pending jobs running
|
||||
"""
|
||||
assert( False ) # this method is abstract
|
||||
assert False # this method is abstract
|
||||
|
||||
class SimpleSlotAllocator( SlotAllocator ):
|
||||
def getMachinesThatNeedWakeUp( self, pendingJobs, clusterState ):
|
||||
|
||||
class SimpleSlotAllocator(SlotAllocator):
|
||||
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||
machinesThatNeedWakeUp = {}
|
||||
highestPriorityPendingJob = pendingJobs.values()[0]
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr() )
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : looking for free slots for job ' + highestPriorityPendingJob.getId().asStr())
|
||||
numFreeSlots = {} # contains the number of free slots for each queueMachine
|
||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||
numFreeSlots[ queueMachine ] = clusterState.getJobsState().getNumFreeSlotsOnQueueMachine( queueMachine )
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[ %s ] with %d ' % (queueMachine.getName(), numFreeSlots[ queueMachine ]) )
|
||||
numFreeSlots[queueMachine] = clusterState.getJobsState().getNumFreeSlotsOnQueueMachine(queueMachine)
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : init numFreeSlots[%s] with %d ' % (queueMachine.getName(), numFreeSlots[queueMachine]))
|
||||
remainingNumSlotsToAllocate = highestPriorityPendingJob.m_jobRequirements.m_numSlots
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate )
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||
# first look in running machines if there are available slots
|
||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName() )
|
||||
machine = clusterState.getMachines()[ queueMachine.getMachineName() ]
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
|
||||
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
||||
if machine.getPowerState() == PowerState.ON:
|
||||
if clusterState.queueMachineFitsJobRequirements( queueMachine, highestPriorityPendingJob.m_jobRequirements ):
|
||||
numSlotsAllocatedOnThisMachine = min( numFreeSlots[ queueMachine ], remainingNumSlotsToAllocate )
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName() ) )
|
||||
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements):
|
||||
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on already running %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName()))
|
||||
|
||||
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
||||
numFreeSlots[ queueMachine ] -= numSlotsAllocatedOnThisMachine
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate )
|
||||
assert( remainingNumSlotsToAllocate >= 0 )
|
||||
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||
assert remainingNumSlotsToAllocate >= 0
|
||||
if remainingNumSlotsToAllocate == 0:
|
||||
break
|
||||
if remainingNumSlotsToAllocate > 0:
|
||||
# now look into machines that are asleep
|
||||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName() )
|
||||
machine = clusterState.getMachines()[ queueMachine.getMachineName() ]
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : examining queueMachine %s ' % queueMachine.getName())
|
||||
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
||||
if machine.getPowerState() == PowerState.SLEEP:
|
||||
if clusterState.queueMachineFitsJobRequirements( queueMachine, highestPriorityPendingJob.m_jobRequirements ):
|
||||
numSlotsAllocatedOnThisMachine = min( numFreeSlots[ queueMachine ], remainingNumSlotsToAllocate )
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName() ) )
|
||||
if clusterState.queueMachineFitsJobRequirements(queueMachine, highestPriorityPendingJob.m_jobRequirements):
|
||||
numSlotsAllocatedOnThisMachine = min(numFreeSlots[queueMachine], remainingNumSlotsToAllocate)
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : found %d slots on sleeping %s ' % (numSlotsAllocatedOnThisMachine, queueMachine.getMachineName()))
|
||||
remainingNumSlotsToAllocate -= numSlotsAllocatedOnThisMachine
|
||||
numFreeSlots[ queueMachine ] -= numSlotsAllocatedOnThisMachine
|
||||
machinesThatNeedWakeUp[ machine.getName() ] = machine
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate )
|
||||
assert( remainingNumSlotsToAllocate >= 0 )
|
||||
numFreeSlots[queueMachine] -= numSlotsAllocatedOnThisMachine
|
||||
machinesThatNeedWakeUp[machine.getName()] = machine
|
||||
logInfo('SimpleSlotAllocator::getMachinesThatNeedWakeUp : still %d slots to find' % remainingNumSlotsToAllocate)
|
||||
assert remainingNumSlotsToAllocate >= 0
|
||||
if remainingNumSlotsToAllocate == 0:
|
||||
break
|
||||
if remainingNumSlotsToAllocate != 0:
|
||||
return {} # not enough slots available
|
||||
return machinesThatNeedWakeUp
|
||||
|
||||
class DecoupledSlotAllocator( SlotAllocator ):
|
||||
|
||||
class DecoupledSlotAllocator(SlotAllocator):
|
||||
"""
|
||||
a slot allocator that doesn't know much about sge, and does not attempt to guess what sge's scheduler would do
|
||||
Instead, it uses a very simple strategy : it wakes up all the machines periodically to allow jobs to get in.
|
||||
"""
|
||||
def __init__( self ):
|
||||
def __init__(self):
|
||||
self.m_delayBetweenPeriodicChecks = -1 # in seconds. Disable periodic checks by setting this to -1
|
||||
self.m_lastCheckTime = time.time()
|
||||
self.m_lastClusterState = None
|
||||
def jobsStateHasChanged( self, newClusterState ):
|
||||
|
||||
def jobsStateHasChanged(self, newClusterState):
|
||||
"""
|
||||
returns true if there is a change in the cluster state that can cause a pending job
|
||||
to start (provided all machines are enabled)
|
||||
|
@ -85,8 +90,8 @@ class DecoupledSlotAllocator( SlotAllocator ):
|
|||
newJobs = newClusterState.m_jobsState.m_jobs
|
||||
bJobsHaveChanged = False
|
||||
oldJobsOnly = oldJobs.copy() # shallow copy
|
||||
#print 'oldJobs : ', oldJobs
|
||||
#print 'newJobs : ', newJobs
|
||||
# print 'oldJobs : ', oldJobs
|
||||
# print 'newJobs : ', newJobs
|
||||
"""
|
||||
print 'self.m_lastClusterState', self.m_lastClusterState
|
||||
print 'newClusterState', newClusterState
|
||||
|
@ -101,23 +106,24 @@ class DecoupledSlotAllocator( SlotAllocator ):
|
|||
print 'id(newJobs) : ', id(newJobs)
|
||||
"""
|
||||
for newJob in newJobs.values():
|
||||
#logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
|
||||
# logDebug('DecoupledSlotAllocator::jobsStateHasChanged newJob id=%s' % newJob.getId().asStr())
|
||||
if newJob.getId() in oldJobs:
|
||||
#logDebug('DecoupledSlotAllocator::jobsStateHasChanged job id=%d is in old jobs' % newJob.getId())
|
||||
# logDebug('DecoupledSlotAllocator::jobsStateHasChanged job id=%d is in old jobs' % newJob.getId())
|
||||
del oldJobsOnly[newJob.getId()]
|
||||
else:
|
||||
# ah ... a new job has arrived
|
||||
logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr() )
|
||||
logInfo('A new job (jobId =%s) has been detected ' % newJob.getId().asStr())
|
||||
bJobsHaveChanged = True
|
||||
if len(oldJobsOnly) != 0:
|
||||
for oldJob in oldJobsOnly.values():
|
||||
logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr() )
|
||||
logInfo('Job (jobId =%s) has finished' % oldJob.getId().asStr())
|
||||
# at least one old job has finished, freeing some slots
|
||||
bJobsHaveChanged = True
|
||||
return bJobsHaveChanged
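The change detection above boils down to comparing the two job-id sets; a minimal standalone sketch with plain dicts (the job ids are made up):

old_jobs = {101: 'job A', 102: 'job B'}
new_jobs = {102: 'job B', 103: 'job C'}
finished_jobs = set(old_jobs) - set(new_jobs)   # disappeared jobs -> some slots were freed
new_arrivals = set(new_jobs) - set(old_jobs)    # newly submitted jobs
jobs_state_has_changed = bool(finished_jobs) or bool(new_arrivals)
assert finished_jobs == {101} and new_arrivals == {103} and jobs_state_has_changed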
|
||||
def getMachinesThatNeedWakeUp( self, pendingJobs, clusterState ):
|
||||
|
||||
def getMachinesThatNeedWakeUp(self, pendingJobs, clusterState):
|
||||
machinesThatNeedWakeUp = {}
|
||||
bJobsStateHasChanged = self.jobsStateHasChanged( clusterState )
|
||||
bJobsStateHasChanged = self.jobsStateHasChanged(clusterState)
|
||||
currentTime = time.time()
|
||||
# we do periodic checks to detect changes in cluster state that are not detected by jobsStateHasChanged
|
||||
# for example changes in the requirements, in the allocation policy, etc...
|
||||
|
@ -132,10 +138,10 @@ class DecoupledSlotAllocator( SlotAllocator ):
|
|||
for queueMachine in clusterState.getJobsState().getQueueMachines().values():
|
||||
if queueMachine.getMachineName() in clusterState.getMachines():
|
||||
# this means that the machine is under the cluster controller's control
|
||||
machine = clusterState.getMachines()[ queueMachine.getMachineName() ]
|
||||
machine = clusterState.getMachines()[queueMachine.getMachineName()]
|
||||
if machine.getPowerState() == PowerState.SLEEP:
|
||||
machinesThatNeedWakeUp[ machine.getName() ] = machine
|
||||
machinesThatNeedWakeUp[machine.getName()] = machine
|
||||
self.m_lastCheckTime = currentTime
|
||||
self.m_lastClusterState = copy.copy(clusterState)
|
||||
#print 'self.m_lastClusterState', self.m_lastClusterState
|
||||
# print 'self.m_lastClusterState', self.m_lastClusterState
|
||||
return machinesThatNeedWakeUp
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
import Util
|
||||
from QstatParser import *
|
||||
import time
|
||||
from Util import executeProgram
|
||||
from QstatParser import QstatParser
|
||||
from Log import logDebug, logWarning
|
||||
|
||||
|
||||
class SunGridEngine:
|
||||
|
||||
def getCurrentJobsState( self ):
|
||||
def getCurrentJobsState(self):
|
||||
bBUG_00000009_IS_STILL_ALIVE = True
|
||||
if bBUG_00000009_IS_STILL_ALIVE:
|
||||
logDebug('Querying the current state of jobs')
|
||||
|
@ -11,34 +14,33 @@ class SunGridEngine:
|
|||
delayBetweenAttemps = 5 # in seconds
|
||||
while returnCode != 0:
|
||||
command = ['qstat', '-f', '-u', '*']
|
||||
(returnCode, qstatOutput, stderr) = executeProgram( command )
|
||||
(returnCode, qstatOutput, stderr) = executeProgram(command)
|
||||
if returnCode != 0:
|
||||
logWarning('command "%s" failed (returnCode = %d, stdout="%s", stderr="%s"). Retrying in %d seconds' % (' '.join(command), returnCode, qstatOutput, stderr, delayBetweenAttemps))
|
||||
time.sleep(delayBetweenAttemps)
|
||||
if bBUG_00000009_IS_STILL_ALIVE:
|
||||
logDebug('Just got current state of jobs')
|
||||
|
||||
jobsState = QstatParser().parseQstatOutput( qstatOutput )
|
||||
jobsState.setTime( time.time() )
|
||||
|
||||
jobsState = QstatParser().parseQstatOutput(qstatOutput)
|
||||
jobsState.setTime(time.time())
|
||||
|
||||
# read the requirements for pending jobs (which parallel environment, which queue, which architecture) from sge
|
||||
if False: # no need for job details at the moment and since it's very slow, it's been disabled
|
||||
for unused_jobId, job in jobsState.getPendingJobs().items():
|
||||
(returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] )
|
||||
(returnCode, stdout, stderr) = executeProgram(['qstat', '-j', job.getId().asStr()])
|
||||
assert returnCode != 0, 'prout'
|
||||
QstatParser().parseJobDetails( stdout, job )
|
||||
QstatParser().parseJobDetails(stdout, job)
|
||||
|
||||
return jobsState
|
||||
|
||||
def setQueueInstanceActivation( self, strQueueInstanceName, bEnable ):
|
||||
def setQueueInstanceActivation(self, strQueueInstanceName, bEnable):
|
||||
argument = 'd'
|
||||
if bEnable:
|
||||
argument = 'e'
|
||||
bBUG_00000269_IS_STILL_ALIVE = True # for some reason, qmod -d (and maybe any sge command) could fail with error: commlib error: can't connect to service (Address already in use)
|
||||
delayBetweenAttemps = 5 # in seconds
|
||||
while True:
|
||||
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-'+argument, strQueueInstanceName])
|
||||
errorCode, unused_stdout, unused_stderr = executeProgram(['qmod', '-' + argument, strQueueInstanceName])
|
||||
if bBUG_00000269_IS_STILL_ALIVE:
|
||||
# if the command failed, try again
|
||||
if errorCode == 0:
|
||||
|
@ -48,11 +50,9 @@ class SunGridEngine:
|
|||
break
|
||||
return (errorCode == 0)
|
||||
|
||||
def queueIsEmpty( self, strMachineName ):
|
||||
(returnCode, qstatOutput, unused_stderr) = executeProgram( ['qstat', '-f', '-u', '*'] )
|
||||
assert( returnCode == 0 )
|
||||
jobsState = QstatParser().parseQstatOutput( qstatOutput )
|
||||
jobs = jobsState.getJobsOnMachine( strMachineName )
|
||||
def queueIsEmpty(self, strMachineName):
|
||||
(returnCode, qstatOutput, unused_stderr) = executeProgram(['qstat', '-f', '-u', '*'])
|
||||
assert returnCode == 0
|
||||
jobsState = QstatParser().parseQstatOutput(qstatOutput)
|
||||
jobs = jobsState.getJobsOnMachine(strMachineName)
|
||||
return (len(jobs) == 0)
|
||||
|
||||
|
|
@ -1,12 +1,12 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
sys.path.insert(0, '..')
|
||||
from Log import *
|
||||
from Log import logInfo
|
||||
import Util
|
||||
from PowerState import *
|
||||
|
||||
from PowerState import PowerState
|
||||
from HTMLParser import HTMLParser
|
||||
|
||||
|
||||
def Test0000():
|
||||
logInfo('Testing bug 00000003 if a series of wake up, goto sleep can shutdown a machine')
|
||||
strTargetMachineName = 'simpatix12'
|
||||
|
@ -14,15 +14,16 @@ def Test0000():
|
|||
while True:
|
||||
if ePowerState == PowerState.ON:
|
||||
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
||||
assert( bSuccess )
|
||||
assert bSuccess
|
||||
bSuccess = Util.blockingPutMachineToSleep(strTargetMachineName)
|
||||
ePowerState = PowerState.SLEEP
|
||||
elif ePowerState == PowerState.SLEEP:
|
||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||
assert( bSuccess )
|
||||
assert bSuccess
|
||||
ePowerState = PowerState.ON
|
||||
else:
|
||||
assert(False)
|
||||
assert False
|
||||
|
||||
|
||||
def Test0001():
|
||||
logInfo('Testing bug 00000003 : could it be caused by a sleep and a power on at the same time ?')
|
||||
|
@ -30,12 +31,13 @@ def Test0001():
|
|||
ePowerState = Util.getPowerState(strTargetMachineName)
|
||||
if ePowerState == PowerState.SLEEP:
|
||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||
assert( bSuccess )
|
||||
assert bSuccess
|
||||
ePowerState = PowerState.ON
|
||||
assert(ePowerState == PowerState.ON)
|
||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName )
|
||||
assert ePowerState == PowerState.ON
|
||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
|
||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||
assert(bSuccess)
|
||||
assert bSuccess
|
||||
|
||||
|
||||
def Test0002():
|
||||
logInfo('Testing bug 00000003 : could it be caused by a power on quickly followed by a sleep ?')
|
||||
|
@ -43,11 +45,12 @@ def Test0002():
|
|||
ePowerState = Util.getPowerState(strTargetMachineName)
|
||||
if ePowerState == PowerState.ON:
|
||||
bSuccess = Util.blockingWakeUpMachine(strTargetMachineName)
|
||||
assert( bSuccess )
|
||||
assert bSuccess
|
||||
ePowerState = PowerState.SLEEP
|
||||
assert(ePowerState == PowerState.SLEEP)
|
||||
Util.executeIpmiCommand( strTargetMachineName, 'chassis power on' )
|
||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName )
|
||||
assert ePowerState == PowerState.SLEEP
|
||||
Util.executeIpmiCommand(strTargetMachineName, 'chassis power on')
|
||||
Util.executeCommand("ssh %s 'pmset sleepnow'" % strTargetMachineName)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
Test0000()
|
|
@ -1,18 +1,19 @@
|
|||
#import .Util
|
||||
#import ..SimpaDbUtil
|
||||
from .Log import *
|
||||
from .PowerState import *
|
||||
# import .Util
|
||||
# import ..SimpaDbUtil
|
||||
from .Log import logDebug, logInfo, logWarning, logError
|
||||
from .PowerState import PowerState, PowerStateToStr
|
||||
import re
|
||||
import io
|
||||
import os
|
||||
import traceback
|
||||
import sys
import time  # time.sleep is used by the retry loops further down in this module
|
||||
|
||||
def executeProgram( astrArguments ):
|
||||
|
||||
def executeProgram(astrArguments):
|
||||
bBUG_00000008_IS_STILL_ACTIVE = True
|
||||
if bBUG_00000008_IS_STILL_ACTIVE:
|
||||
logDebug('executeProgram : program = [%s]' % (','.join(astrArguments) ))
|
||||
(returnCode, stdout, stderr) = Lib.Util.executeProgram( astrArguments )
|
||||
logDebug('executeProgram : program = [%s]' % (','.join(astrArguments)))
|
||||
(returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments)
|
||||
if bBUG_00000008_IS_STILL_ACTIVE:
|
||||
logDebug('executeCommand : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
|
||||
# for debugging purpose, log info in case the command failed
|
||||
|
@ -22,32 +23,34 @@ def executeProgram( astrArguments ):
|
|||
logDebug('executeCommand : stderr of [%s] = %s' % (','.join(astrArguments), stderr))
|
||||
return (returnCode, stdout, stderr)
|
||||
|
||||
def executeCommand( command ):
|
||||
#logDebug('executeCommand : command = ' + command)
|
||||
(returnCode, stdout, stderr) = Lib.Util.executeCommand( command )
|
||||
#logDebug('executeCommand : return code of "'+command+'" = '+str(returnCode))
|
||||
|
||||
def executeCommand(command):
|
||||
# logDebug('executeCommand : command = ' + command)
|
||||
(returnCode, stdout, stderr) = Lib.Util.executeCommand(command)
|
||||
# logDebug('executeCommand : return code of "'+command+'" = '+str(returnCode))
|
||||
return (returnCode, stdout, stderr)
|
||||
|
||||
def executeIpmiCommand( machineName, ipmiCommandArgs ):
|
||||
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress( machineName )
|
||||
|
||||
def executeIpmiCommand(machineName, ipmiCommandArgs):
|
||||
lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName)
|
||||
lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
|
||||
astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
|
||||
astrProgram.extend( ipmiCommandArgs )
|
||||
#print 'executeIpmiCommand'
|
||||
#print astrProgram
|
||||
astrProgram.extend(ipmiCommandArgs)
|
||||
# print 'executeIpmiCommand'
|
||||
# print astrProgram
|
||||
bBUG_00000005_IS_STILL_ACTIVE = True
|
||||
if bBUG_00000005_IS_STILL_ACTIVE:
|
||||
# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
|
||||
bCommandSucceeded = False
|
||||
while not bCommandSucceeded:
|
||||
(returnCode, stdout, stderr) = executeProgram( astrProgram )
|
||||
(returnCode, stdout, stderr) = executeProgram(astrProgram)
|
||||
if returnCode == 0:
|
||||
bCommandSucceeded = True
|
||||
else:
|
||||
logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
|
||||
time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
|
||||
else:
|
||||
(returnCode, stdout, stderr) = executeProgram( astrProgram )
|
||||
(returnCode, stdout, stderr) = executeProgram(astrProgram)
|
||||
"""
|
||||
sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
|
||||
Unabled to establish a session with the BMC.
|
||||
|
@ -65,24 +68,25 @@ def executeIpmiCommand( machineName, ipmiCommandArgs ):
|
|||
|
||||
return (returnCode, stdout, stderr)
|
||||
|
||||
def getPowerState( machineName ):
|
||||
|
||||
def getPowerState(machineName):
|
||||
ePowerState = PowerState.UNKNOWN
|
||||
bPowerStateRead = False
|
||||
iNumFailedAttempts = 0
|
||||
while not bPowerStateRead:
|
||||
(returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['sensor', 'get', 'ACPI State'] )
|
||||
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State'])
|
||||
if returnCode == 0:
|
||||
matchObj = re.search('\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
|
||||
matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
|
||||
bBUG_00000002_IS_STILL_ACTIVE = True
|
||||
if bBUG_00000002_IS_STILL_ACTIVE:
|
||||
if matchObj == None:
|
||||
if matchObj is None:
|
||||
# the following warning has been commented out because it pollutes the logs and apparently
|
||||
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
|
||||
# no power on event is logged ...
|
||||
#logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
|
||||
# logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power is on because that''s what I noticed when I had the case.' % machineName)
|
||||
return PowerState.ON
|
||||
else:
|
||||
assert( matchObj )
|
||||
assert matchObj
|
||||
strAcpiState = matchObj.group('AcpiState')
|
||||
if strAcpiState == 'S0/G0':
|
||||
ePowerState = PowerState.ON
|
||||
|
@ -92,11 +96,11 @@ def getPowerState( machineName ):
|
|||
ePowerState = PowerState.OFF
|
||||
else:
|
||||
print(strAcpiState)
|
||||
assert( False )
|
||||
assert False
|
||||
bPowerStateRead = True
|
||||
else:
|
||||
# error ... it's either because the machine is unplugged or because the machine is busy (well I'm not sure what happened but I had the case where the command failed for no apparent reason, and therefore I suspect it to be busy ). In order to differentiate these 2 cases, we try again and if this caommand fails too many times then we decide it's unplugged (very dodgy I know but I'm disapointed that this command doen't always work, and for now I don't know other ways to differentiate between these cases....)
|
||||
iMAX_NUM_ATTEMPTS=5
|
||||
# error ... it's either because the machine is unplugged or because the machine is busy (well I'm not sure what happened but I had the case where the command failed for no apparent reason, and therefore I suspect it to be busy). In order to differentiate these 2 cases, we try again and if this command fails too many times then we decide it's unplugged (very dodgy I know but I'm disappointed that this command doesn't always work, and for now I don't know other ways to differentiate between these cases....)
|
||||
iMAX_NUM_ATTEMPTS = 5
|
||||
iNumFailedAttempts += 1
|
||||
if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
|
||||
logWarning('failed to read the power state of %s. I\'ll try again a bit later....' % machineName)
|
||||
|
@ -107,17 +111,19 @@ def getPowerState( machineName ):
|
|||
bPowerStateRead = True
|
||||
return ePowerState
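For illustration, the ACPI-state pattern above applied to a hand-written sample; the shape of the ipmitool output line is an assumption here, only the regular expression is taken from the code:

import re

sample_line = ' States Asserted : [S0/G0: working]'  # hypothetical ipmitool output, for illustration only
matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', sample_line)
assert matchObj is not None
assert matchObj.group('AcpiState') == 'S0/G0'  # which the code above maps to PowerState.ON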
|
||||
|
||||
def wakeUpMachine( machineName ):
|
||||
|
||||
def wakeUpMachine(machineName):
|
||||
"""
|
||||
this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
|
||||
@return true on success, false otherwise
|
||||
@note I once had this method failing for no obvious reason... maybe this command does not succeed if the machine is in a transition state
|
||||
"""
|
||||
(returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['chassis', 'power', 'on'] )
|
||||
(returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['chassis', 'power', 'on'])
|
||||
bSuccess = (returnCode == 0) # this command can fail if the machine is manually unplugged for example
|
||||
return bSuccess
|
||||
|
||||
def blockingPutMachineToSleep( machineName ):
|
||||
|
||||
def blockingPutMachineToSleep(machineName):
|
||||
"""
|
||||
@return true on success, false otherwise
|
||||
"""
|
||||
|
@ -136,14 +142,14 @@ def blockingPutMachineToSleep( machineName ):
|
|||
while iDelay < iMaxGoToSleepDuration:
|
||||
time.sleep(5)
|
||||
iDelay += 5
|
||||
ePowerState = getPowerState( machineName )
|
||||
ePowerState = getPowerState(machineName)
|
||||
if ePowerState == PowerState.SLEEP:
|
||||
logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
|
||||
return True
|
||||
else:
|
||||
if ePowerState != PowerState.ON:
|
||||
logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
|
||||
assert(ePowerState == PowerState.ON)
|
||||
assert ePowerState == PowerState.ON
|
||||
iAttempt += 1
|
||||
if iAttempt > iMaxNumAttempts:
|
||||
if bBUG_239_IS_STILL_ALIVE:
|
||||
|
@ -156,6 +162,7 @@ def blockingPutMachineToSleep( machineName ):
|
|||
logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
|
||||
return True
|
||||
|
||||
|
||||
def blockingWakeUpMachine(machineName):
|
||||
logInfo('waking up machine %s...' % machineName)
|
||||
numAttempts = 0
|
||||
|
@ -165,11 +172,11 @@ def blockingWakeUpMachine(machineName):
|
|||
iNumWakeUpAttempts = 0
|
||||
bWakeUpMachineSucceeded = False
|
||||
while not bWakeUpMachineSucceeded:
|
||||
bWakeUpMachineSucceeded = wakeUpMachine( machineName )
|
||||
bWakeUpMachineSucceeded = wakeUpMachine(machineName)
|
||||
iNumWakeUpAttempts += 1
|
||||
# the previous command can fail if the machine is already in a transition
|
||||
# in that case we try several times before giving up
|
||||
if(bWakeUpMachineSucceeded == False):
|
||||
if not bWakeUpMachineSucceeded:
|
||||
if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
|
||||
iDelay = 5
|
||||
logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
|
||||
|
@ -180,18 +187,18 @@ def blockingWakeUpMachine(machineName):
|
|||
|
||||
bWakeUpFailed = False
|
||||
# wait until the machine is operational
|
||||
WAKEUPTIMEOUT=5*60 # max number of seconds allowed for a machine to be alive after a wakeup request
|
||||
WAKEUPTIMEOUT = 5 * 60 # max number of seconds allowed for a machine to be alive after a wakeup request
|
||||
wakeUpToAliveDuration = 0
|
||||
while not Lib.SimpaDbUtil.isMachineResponding( machineName ):
|
||||
while not Lib.SimpaDbUtil.isMachineResponding(machineName):
|
||||
time.sleep(5)
|
||||
wakeUpToAliveDuration+=5
|
||||
wakeUpToAliveDuration += 5
|
||||
if wakeUpToAliveDuration > WAKEUPTIMEOUT:
|
||||
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
|
||||
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
|
||||
bWakeUpFailed = True
|
||||
break
|
||||
if bWakeUpFailed:
|
||||
numAttempts+=1
|
||||
numAttempts += 1
|
||||
if numAttempts >= 2:
|
||||
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
|
||||
return False # power state changed manually ?
|
||||
|
@ -202,11 +209,12 @@ def blockingWakeUpMachine(machineName):
|
|||
logInfo('Waking up of machine %s completed successfully' % machineName)
|
||||
return True
|
||||
|
||||
|
||||
def onException(exception):
|
||||
sys.stdout.flush()
|
||||
strExceptionType = type( exception )
|
||||
strExceptionType = type(exception)
|
||||
strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
|
||||
#traceback.print_last()
|
||||
# traceback.print_last()
|
||||
f = io.StringIO()
|
||||
traceback.print_exc(file=f)
|
||||
strMessage += f.getvalue()
|
||||
|
@ -216,13 +224,11 @@ def onException(exception):
|
|||
|
||||
try:
|
||||
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
|
||||
#by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that is this operation fails, then the
|
||||
# by a failure of sendTextMail... that's why I've embedded the sendmail inside a try except block, so that if this operation fails, then the
|
||||
# kill of the main process is still executed.
|
||||
Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
|
||||
Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
|
||||
except BaseException:
|
||||
logError("Could not send the email to notify the administrator that cluster controller failed")
|
||||
pass
|
||||
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
|
||||
exit()
|
||||
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
import Sensor
|
||||
|
||||
|
||||
class ClusterNodeSensorsReadings:
|
||||
"""
|
||||
|
||||
|
@ -13,22 +14,26 @@ class ClusterNodeSensorsReadings:
|
|||
def __init__(self, clusterNodeName):
|
||||
self.m_clusterNodeName = clusterNodeName
|
||||
self.m_sensors = {}
|
||||
#self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
|
||||
# self.m_powerState = ClusterNodeStatus.POWERSTATE_UNKNOWN
|
||||
return
|
||||
|
||||
def addSensor(self, sensor):
|
||||
self.m_sensors[sensor.m_name] = sensor
|
||||
|
||||
def dump(self):
|
||||
for key,sensor in self.m_sensors.items():
|
||||
for key, sensor in self.m_sensors.items():
|
||||
sensor.dump()
|
||||
return
|
||||
#def getPowerState(self):
|
||||
|
||||
# def getPowerState(self):
|
||||
# return self.m_powerState
|
||||
def getLowestTemperature( self ):
|
||||
#log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||
|
||||
def getLowestTemperature(self):
|
||||
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||
lowestTemperature = 0.0
|
||||
lowestTemperatureIsDefined = False
|
||||
for key,sensor in self.m_sensors.items():
|
||||
#log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||
for key, sensor in self.m_sensors.items():
|
||||
# log('ClusterNodeSensorsReadings::getLowestTemperature : start')
|
||||
if sensor.typeName() == 'Temperature':
|
||||
sensor.m_temperature
|
||||
if lowestTemperatureIsDefined:
|
||||
|
@ -37,6 +42,6 @@ class ClusterNodeSensorsReadings:
|
|||
else:
|
||||
lowestTemperature = sensor.m_temperature
|
||||
lowestTemperatureIsDefined = True
|
||||
assert( lowestTemperatureIsDefined )
|
||||
#log('ClusterNodeSensorsReadings::getLowestTemperature : end')
|
||||
assert lowestTemperatureIsDefined
|
||||
# log('ClusterNodeSensorsReadings::getLowestTemperature : end')
|
||||
return lowestTemperature
|
||||
|
|
|
@ -3,64 +3,66 @@ import re
from Sensor import FanSensor, TemperatureSensor
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings


class IpmiTool202Parser:
def parseSensorOutput( self, strOutput, clusterNodeName ):
sensorReadings=ClusterNodeSensorsReadings(clusterNodeName)
def parseSensorOutput(self, strOutput, clusterNodeName):
sensorReadings = ClusterNodeSensorsReadings(clusterNodeName)
f = io.StringIO(strOutput)
line = f.readline()
while( len(line) > 0 ):
#print line,
matchObj = re.match( '^Sensor ID[ ]*\: \'(?P<sensorName>[a-zA-Z 0-9]+)\'', line )
while len(line) > 0:
# print line,
matchObj = re.match(r'^Sensor ID[ ]*\: \'(?P<sensorName>[a-zA-Z 0-9]+)\'', line)
if matchObj:
sensorName = matchObj.group('sensorName')
# print sensorName
# read the entity id
line = f.readline()
matchObj = re.match( '^ Entity ID[ ]*\: (?P<entityId>[0-9\.]+)', line )
assert(matchObj)
matchObj = re.match(r'^ Entity ID[ ]*\: (?P<entityId>[0-9\.]+)', line)
assert matchObj
entityId = matchObj.group('entityId')
# print entityId
# read the sensor type
line = f.readline()
matchObj = re.match( '^ Sensor Type[\(\)a-zA-Z ]*\: (?P<sensorType>[a-zA-Z \(\)]+)', line )
assert(matchObj)
matchObj = re.match(r'^ Sensor Type[\(\)a-zA-Z ]*\: (?P<sensorType>[a-zA-Z \(\)]+)', line)
assert matchObj
sensorType = matchObj.group('sensorType')
#print sensorType
# print sensorType
if sensorType == 'Fan':
rpms = self.parseFanSensorOutput(f)
if temperature != None:
if rpms is not None:
sensor = FanSensor(sensorName)
sensor.m_rpms = rpms
elif sensorType == 'Temperature':
temperature = self.parseTemperatureSensorOutput(f)
if temperature != None:
if temperature is not None:
sensor = TemperatureSensor(sensorName)
sensor.m_temperature = temperature
else:
#ignoring other sensors
# ignoring other sensors
sensor = None
if sensor:
sensorReadings.addSensor( sensor )
sensorReadings.addSensor(sensor)
else:
None
#assert(False)
# assert(False)
line = f.readline()
f.close()
return sensorReadings

def parseFanSensorOutput(self, file):
"""
reads the fan specific ipmitool output
"""
|
||||
line = file.readline()
|
||||
#print line
|
||||
matchObj = re.match( '^ Sensor Reading[ ]*\: (?P<numRpms>[0-9]+) \(\+/\- (?P<rpmsPrecision>[0-9]+)\) RPM', line )
|
||||
if(matchObj):
|
||||
# print line
|
||||
matchObj = re.match(r'^ Sensor Reading[ ]*\: (?P<numRpms>[0-9]+) \(\+/\- (?P<rpmsPrecision>[0-9]+)\) RPM', line)
|
||||
if matchObj:
|
||||
numRpms = matchObj.group('numRpms')
|
||||
#print numRpms
|
||||
rpms = float( numRpms )
|
||||
# print numRpms
|
||||
rpms = float(numRpms)
|
||||
return rpms
|
||||
else:
|
||||
matchObj = re.match( '^ Sensor Reading[ ]*\: Not Present', line )
|
||||
assert(matchObj)
|
||||
matchObj = re.match(r'^ Sensor Reading[ ]*\: Not Present', line)
|
||||
assert matchObj
|
||||
return None
|
||||
|
||||
def parseTemperatureSensorOutput(self, file):
|
||||
|
@ -69,13 +71,13 @@ class IpmiTool202Parser:
|
|||
"""
|
||||
# Sensor Reading : 36 (+/- 0) degrees C
|
||||
line = file.readline()
|
||||
#print line
|
||||
matchObj = re.match( '^ Sensor Reading[ ]*\: (?P<temperature>[0-9]+) \(\+/\- (?P<precision>[0-9]+)\) degrees C', line )
|
||||
if(matchObj):
|
||||
# print line
|
||||
matchObj = re.match(r'^ Sensor Reading[ ]*\: (?P<temperature>[0-9]+) \(\+/\- (?P<precision>[0-9]+)\) degrees C', line)
|
||||
if matchObj:
|
||||
temperature = matchObj.group('temperature')
|
||||
temperature = float( temperature )
|
||||
temperature = float(temperature)
|
||||
return temperature
|
||||
else:
|
||||
matchObj = re.match( '^ Sensor Reading[ ]*\: Not Present', line )
|
||||
assert(matchObj)
|
||||
matchObj = re.match(r'^ Sensor Reading[ ]*\: Not Present', line)
|
||||
assert matchObj
|
||||
return None
|
||||
|
|
|
@ -3,37 +3,37 @@ import re
from Sensor import FanSensor, TemperatureSensor
from ClusterNodeSensorsReadings import ClusterNodeSensorsReadings


class IpmiTool218Parser:
def parseSensorOutput( self, strOutput, clusterNodeName ):
sensorReadings=ClusterNodeSensorsReadings(clusterNodeName)
def parseSensorOutput(self, strOutput, clusterNodeName):
sensorReadings = ClusterNodeSensorsReadings(clusterNodeName)
f = io.StringIO(strOutput)
line = f.readline()
while( len(line) > 0 ):
#print line,
matchObj = re.match( '^(?P<sensorName>[a-zA-Z 0-9]+[a-zA-Z 0-9]*[a-zA-Z0-9])[ ]*\| (?P<sensorValue>[\.0-9]+)[ ]*\| (?P<sensorUnit>[a-zA-Z0-9][a-zA-Z 0-9]*[a-zA-Z0-9])[?]*', line )
while len(line) > 0:
# print line,
matchObj = re.match(r'^(?P<sensorName>[a-zA-Z 0-9]+[a-zA-Z 0-9]*[a-zA-Z0-9])[ ]*\| (?P<sensorValue>[\.0-9]+)[ ]*\| (?P<sensorUnit>[a-zA-Z0-9][a-zA-Z 0-9]*[a-zA-Z0-9])[?]*', line)
if matchObj:
#log('readClusterNodeSensorsIpmiTool2_1_8 : sensorName = '+matchObj.group('sensorName'))
#log('readClusterNodeSensorsIpmiTool2_1_8 : sensorValue = '+matchObj.group('sensorValue'))
#log('readClusterNodeSensorsIpmiTool2_1_8 : sensorUnit = "'+matchObj.group('sensorUnit')+'"')
# log('readClusterNodeSensorsIpmiTool2_1_8 : sensorName = '+matchObj.group('sensorName'))
# log('readClusterNodeSensorsIpmiTool2_1_8 : sensorValue = '+matchObj.group('sensorValue'))
# log('readClusterNodeSensorsIpmiTool2_1_8 : sensorUnit = "'+matchObj.group('sensorUnit')+'"')
sensorName = matchObj.group('sensorName')
sensorValue = matchObj.group('sensorValue')
sensorUnit = matchObj.group('sensorUnit')
sensor = None
if sensorUnit == 'degrees C':
sensor = TemperatureSensor(sensorName)
sensor.m_temperature = float( sensorValue )
sensor.m_temperature = float(sensorValue)
elif sensorUnit == 'RPM':
sensor = FanSensor(sensorName)
sensor.m_rpms = float( sensorValue )
sensor.m_rpms = float(sensorValue)
else:
None
if sensor:
#log('readClusterNodeSensorsIpmiTool2_1_8 : adding sensor')
sensorReadings.addSensor( sensor )
# log('readClusterNodeSensorsIpmiTool2_1_8 : adding sensor')
sensorReadings.addSensor(sensor)
else:
None
#assert(False)
# assert(False)
line = f.readline()
f.close()
return sensorReadings
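For reference, a minimal usage sketch of the parser above. The sample ipmitool output, the machine name and the assumption that TemperatureSensor.typeName() reports 'Temperature' are illustrative, not taken from the commit:

# hypothetical usage sketch, not part of the commit
# (assumes IpmiTool218Parser and its Sensor classes are importable from their modules)
sample_output = ("Ambient Temp     | 36.000     | degrees C\n"
                 "Fan1A            | 5400.000   | RPM\n")   # made-up 'ipmitool sensor' lines
parser = IpmiTool218Parser()
readings = parser.parseSensorOutput(sample_output, 'simpatix10')
print(readings.getLowestTemperature())  # expected to print 36.0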
@ -6,9 +6,9 @@ if sys.version_info < (3, 0):
else:
from io import StringIO
import re
from .wol import *
from .wol import wake_on_lan
import os
from .Util import *
from .Util import executeProgram, executeCommand, log
import abc
import sqlite3
from .mysql2sqlite import mysql_to_sqlite

@ -33,7 +33,7 @@ def isMachineResponding(machineName):
# don't stop the program until we understand bug00000004
else:
log('isMachineResponding : Unexpected return code : returnCode=%d, stdout="%s", stderr="%s" , machineName = %s' % (returnCode, stdout, stderr, machineName))
assert(False)
assert False
return False

@ -63,7 +63,7 @@ class RemoteMysqlDb(ISqlDatabaseBackend):

def _connect(self):
self._conn = MySQLdb.connect(self._db_server_fqdn, self._db_user, '', self._db_name)
assert(self._conn)
assert self._conn

def query(self, sql_query):
"""

@ -163,13 +163,13 @@ class SqlDatabaseReader(object):

def machineNameToMacAddress(machineName):
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
assert(conn)
assert conn
sqlQuery = """SELECT mac_address FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='normal'"""
# print sqlQuery
conn.query(sqlQuery)
r = conn.store_result()
row = r.fetch_row(0)
assert( len(row) == 1)
assert len(row) == 1
# print 'row =', row
macAddress = row[0][0]
# print macAddress

@ -182,13 +182,13 @@ def getLightOutManagementIpAddress(machineName):
the light out management ip of a server allows talking to the server even when it's asleep
"""
|
||||
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
||||
assert(conn)
|
||||
assert conn
|
||||
sqlQuery = """SELECT ip_address_1,ip_address_2,ip_address_3,ip_address_4 FROM ethernet_cards WHERE machine_name='""" + machineName + """' AND type='light_out_management'"""
|
||||
# print sqlQuery
|
||||
conn.query(sqlQuery)
|
||||
r = conn.store_result()
|
||||
row = r.fetch_row(0)
|
||||
assert(len(row) == 1)
|
||||
assert len(row) == 1
|
||||
# print 'row =', row
|
||||
ipAddress = ('%s.%s.%s.%s') % (row[0][0], row[0][1], row[0][2], row[0][3])
|
||||
# print macAddress
|
||||
|
@ -199,7 +199,7 @@ def getLightOutManagementIpAddress(machineName):
|
|||
def getClusterMachinesNames():
|
||||
clusterMachinesNames = []
|
||||
conn = MySQLdb.connect('simpatix10', 'simpadb_reader', '', 'simpadb')
|
||||
assert(conn)
|
||||
assert conn
|
||||
sqlQuery = """SELECT name FROM machines WHERE affectation='cluster'"""
|
||||
# print sqlQuery
|
||||
conn.query(sqlQuery)
|
||||
|
@ -231,7 +231,7 @@ def putToSleep(machineName):
|
|||
print 'stderr :'
|
||||
print stderr
|
||||
"""
|
||||
assert(returnCode == 0)
|
||||
assert returnCode == 0
|
||||
# check if the command succeeded by looking at the output (that's the only way I found)
|
||||
f = StringIO.StringIO(stdout)
|
||||
line = f.readline()
|
||||
|
|
|
@ -1,22 +1,28 @@
#!/usr/bin/python

#import sys
#sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python')
# import sys
# sys.path.insert(0, '/homes/raffy/SvnGRaffy/dev/Python')
import re
#import Lib.Util
# import Lib.Util


class SgeConfig:
def __init__( self ):
self.m_attrs={}

def __init__(self):
self.m_attrs = {}

def hasAttr(self, attr_name):
return attr_name in self.m_attrs.keys()
def getAttr( self, strAttrName ):
return self.m_attrs[ strAttrName ]
def setAttr( self, strAttrName, strAttrValue ):

def getAttr(self, strAttrName):
return self.m_attrs[strAttrName]

def setAttr(self, strAttrName, strAttrValue):
assert isinstance(strAttrName, str)
assert isinstance(strAttrValue, str)
self.m_attrs[ strAttrName ] = strAttrValue
def loadFromSgeFormat1String( self, strSgeConfigString ):
self.m_attrs[strAttrName] = strAttrValue

def loadFromSgeFormat1String(self, strSgeConfigString):
"""
loads attrs from a string such as :
hostname simpatix11.univ-rennes1.fr

@ -41,18 +47,18 @@ class SgeConfig:
usage_scaling NONE
report_variables NONE
"""
self.m_attrs={}
self.m_attrs = {}
# put multiline attributes on one line
strSgeConfigString = re.sub(r"\\\n", "", strSgeConfigString)
for strAttrDef in strSgeConfigString.split("\n"):
# print("strAttrDef=%s" % strAttrDef)
if len(strAttrDef) != 0:
matchObj = re.match( "^(?P<attrName>[^\s]+)[ ]+(?P<attrValue>[^\s].*)$", strAttrDef )
matchObj = re.match(r"^(?P<attrName>[^\s]+)[]+(?P<attrValue>[^\s].*)$", strAttrDef)
|
||||
assert matchObj is not None
#print( '%s = %s\n' % (matchObj.group("attrName"), matchObj.group("attrValue") ) )
self.m_attrs[ matchObj.group("attrName") ] = matchObj.group("attrValue")
# print('%s = %s\n' % (matchObj.group("attrName"), matchObj.group("attrValue")))
self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")

def loadFromSgeFormat2String( self, strSgeConfigString ):
def loadFromSgeFormat2String(self, strSgeConfigString):
"""
loads attrs from a string such as :
arch=darwin-x86,num_proc=4,mem_total=8192.000000M, \

@ -67,18 +73,19 @@ class SgeConfig:
np_load_short=1.296631,np_load_medium=1.281616, \
np_load_long=1.271973
"""
self.m_attrs={}
self.m_attrs = {}
if strSgeConfigString != "NONE":
for strAttrDef in strSgeConfigString.split(","):
#print strAttrDef
# print strAttrDef
if len(strAttrDef) != 0:
matchObj = re.match( "^\s*(?P<attrName>[^=]+)=(?P<attrValue>.*)$", strAttrDef )
#print matchObj.group("attrName")
self.m_attrs[ matchObj.group("attrName") ] = matchObj.group("attrValue")
def asFormat1String( self ):
matchObj = re.match(r"^\s*(?P<attrName>[^=]+)=(?P<attrValue>.*)$", strAttrDef)
# print matchObj.group("attrName")
self.m_attrs[matchObj.group("attrName")] = matchObj.group("attrValue")

def asFormat1String(self):
strResult = ""
for (k,v) in self.m_attrs.items():
#print "%s %s" % (k,v)
for (k, v) in self.m_attrs.items():
# print "%s %s" % (k,v)
# if the attribute's value is a list of comma separated strings, make sure there are no spaces after the commas, otherwise the value is not properly interpreted when read back into sge
# for example if the user sets the value of administrator_mail (using qconf -mconf global) to "alice@univ-rennes1.fr, bob@univ-rennes1.fr", then the next call to qconf -sconf global will show a wrong value for administrator_mail, as shown below:
# pag_cmd none
@ -120,33 +127,33 @@ class SgeConfig:
# root@physix-master:~# qconf -Mconf /tmp/global
# only a single value is allowed for configuration attribute "administrator_mail"

cleaned_value = re.sub(',\s*', ',', v)
cleaned_value = re.sub(r',\s*', ',', v)

# prevent space pollution in space separated values, such as in reporting_params (see https://bugzilla.ipr.univ-rennes1.fr/show_bug.cgi?id=2812). If spaces are not compacted, the space separated values will contain more and more spaces and at some point corrupt the value : a line containing just a backslash, such as in the following example:
# reporting_params accounting=true reporting=false \
# flush_time=00:00:15 joblog=false \
# sharelog=00:00:00
# \
cleaned_value = re.sub('\s+', ' ', cleaned_value)
cleaned_value = re.sub(r'\s+', ' ', cleaned_value)
strResult += "%s %s\n" % (k, cleaned_value)
# print("strResult=%s" % strResult)
return strResult
def asFormat2String( self ):

def asFormat2String(self):
strResult = ""
iNumAttrs = len(self.m_attrs)
if iNumAttrs == 0:
return "NONE"
iAttr = 0
for (k,v) in self.m_attrs.items():
#print "%s %s" % (k,v)
strResult += "%s=%s" % (k,v)
for (k, v) in self.m_attrs.items():
# print "%s %s" % (k,v)
strResult += "%s=%s" % (k, v)
if iAttr != (iNumAttrs - 1):
strResult += ","
iAttr+=1
#print strSgeConfigString
iAttr += 1
# print strSgeConfigString
return strResult
def dump( self ):
for (k,v) in self.m_attrs.items():
print("['%s']='%s'" % (k,v))


def dump(self):
for (k, v) in self.m_attrs.items():
print("['%s']='%s'" % (k, v))
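To illustrate the two attribute formats handled by SgeConfig, here is a small usage sketch; the attribute values are invented and the expected outputs in the comments are only what the code above suggests:

# hypothetical usage sketch, not part of the commit
cfg = SgeConfig()
cfg.loadFromSgeFormat2String("arch=darwin-x86,num_proc=4,mem_total=8192.000000M")
print(cfg.getAttr("num_proc"))   # values are kept as strings, so this should print '4'
cfg.setAttr("num_proc", "8")
print(cfg.asFormat2String())     # comma-separated "key=value" pairs, e.g. arch=darwin-x86,num_proc=8,...
print(cfg.asFormat1String())     # one "key value" pair per line, with the comma/space cleanup applied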
@ -15,6 +15,7 @@ else:
from html.parser import HTMLParser
from email.mime.text import MIMEText


def sendTextMail(strFrom, to, strSubject, text):
# from = "SimpaCluster <guillaume.raffy@univ-rennes1.fr>"
mail = MIMEText(text)

@ -47,7 +48,7 @@ def log(message):

def executeProgram(astrArguments):
# log('executeProgram : program [%s]' % (','.join(astrArguments)))
popen = subprocess.Popen( astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
popen = subprocess.Popen(astrArguments, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # bufsize=1 seems to prevent deadlocks that happen 50% the time
stdout, stderr = popen.communicate()
# popen.wait()
result = (popen.returncode, stdout.decode(), stderr)

@ -60,7 +61,7 @@ def executeCommand(command):
"""
executes the shell command such as 'set x=1; myprog $x'
"""
popen = subprocess.Popen( [command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time
popen = subprocess.Popen([command], shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, executable='/bin/bash') # bufsize=1 seems to prevent deadlocks that happen 50% the time
# if we don't specify the optional executable argument, then the default non interactive shell will be used. On debian, the default non-interactive shell is dash, which doesn't understand the keyword 'source' that we use in many places
stdout, stderr = popen.communicate()
# popen.wait()
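For context, a small usage sketch of the two helpers above; the (returncode, stdout, stderr) result shape is inferred from the result = (...) line in executeProgram, and the command strings are only examples:

# hypothetical usage sketch, not part of the commit
returnCode, stdout, stderr = executeProgram(['hostname', '-f'])
if returnCode == 0:
    print(stdout.strip())
# executeCommand runs through /bin/bash, so shell keywords like 'source' work
returnCode, stdout, stderr = executeCommand('source /etc/profile; echo $PATH')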
@ -85,7 +86,6 @@ def executeCommandOn(target_machine_fqdn, command, user=None):
target = '%s@%s' % (user, target_machine_fqdn)
else:
target = target_machine_fqdn

result = executeProgram(['ssh', target, "%s" % command])
logging.debug("finished executing %s on %s as %s" % (command, target_machine_fqdn, user))
return result

@ -94,11 +94,12 @@ def executeCommandOn(target_machine_fqdn, command, user=None):
def getUpsStatus():

class MyHTMLParser(HTMLParser):

def __init__(self):
HTMLParser.__init__(self)
self.TokenList = []

def handle_data( self, data):
def handle_data(self, data):
data = data.strip()
if data and len(data) > 0:
self.TokenList.append(data)

@ -118,7 +119,8 @@ def getUpsStatus():
return
h = MyHTMLParser()
h.feed(res)
tokensList = h.GetTokenList() # @UnusedVariable
tokensList = h.GetTokenList() # noqa:F841


if __name__ == '__main__':
from SimpaDbUtil import wakeUp
@ -1,63 +0,0 @@
'''
The goal of this application is to convert a mno database into mno's web site compatible database (drupal)
'''

import sqlite3


import os
import re
import sys
from SimpaDbUtil import SqlFile, SqlDatabaseReader
from _sqlite3 import Row

class OrchestraSqlDb( object ):
def __init__(self, sql_reader):
"""
:param SqlDatabaseReader sql_reader: the inventory database
"""
super(OrchestraSqlDb, self).__init__()
self._sql_reader = sql_reader

def query(self, sql_query):
return self._sql_reader.query(sql_query)


class Concert(object):
pass

class Recording(object):
pass

class OrchestraDb(object):

def __init__(self, mno_drupal_db_sql_file_path):
self.concerts = {}

sql_source = SqlFile(mno_drupal_db_sql_file_path)
sql_reader = SqlDatabaseReader(sql_source)
orchestra_sql_db = OrchestraSqlDb(sql_reader)

self._parse_from_orchestra_drupal_db(orchestra_sql_db)

def _parse_from_orchestra_drupal_db(self, orchestra_sql_db):
"""
:param OrchestraSqlDb orchestra_sql_db:
"""

concert_rows = orchestra_sql_db.query("SELECT nid,title FROM node WHERE type is 'concert'")

for concert_row in concert_rows:
(nid, title)=concert_row
print(title)
nid = int(nid)
track_id_rows = orchestra_sql_db.query("SELECT field_tracks_target_id FROM field_revision_field_tracks WHERE entity_id=%d" % nid )
for track_id_row in track_id_rows:
(field_tracks_target_id, ) = track_id_row
#print(field_tracks_target_id)

track_rows = orchestra_sql_db.query("SELECT title FROM node WHERE nid=%d" % field_tracks_target_id)
(recording_title, ) = track_rows[0]
print("\t%s" % recording_title)

mno_db = OrchestraDb('/Users/graffy/data/Perso/MeltingNotes_work.git/website/v2_drupal/melting_drupal.sql')
@ -1,6 +1,7 @@
import re

def mysql_to_sqlite( mysql_sql_code, truncate_hex_strings = False ):

def mysql_to_sqlite(mysql_sql_code, truncate_hex_strings=False):
"""
converts a mysql-compatible sql code into a sqlite-compatible sql code
@ -29,23 +30,23 @@ def mysql_to_sqlite( mysql_sql_code, truncate_hex_strings = False ):
content = COMMENTS_RE.sub('', content)

# sqlite doesn't like ' being escaped as \', use '' instead
content = re.sub(r'\\\'', '\'\'', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
content = re.sub(r'\\\'', '\'\'', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

if truncate_hex_strings:
# sqlite doesn't like too big hex strings 0x613a343a7b733a383a
content = re.sub(r'0x[0-9a-f]+', '0xdeadbeef', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
content = re.sub(r'0x[0-9a-f]+', '0xdeadbeef', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

# sqlite doesn't understand
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
content = re.sub(r'\s+CHARACTER SET\s+[^\s]+', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
content = re.sub(r'\s+CHARACTER SET\s+[^\s]+', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

# sqlite doesn't know the utf8_bin :
# `format` varchar(100) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL
#no such collation sequence: utf8_bin
content = re.sub(r'\s+COLLATE\s+utf8_bin\s+', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
# no such collation sequence: utf8_bin
content = re.sub(r'\s+COLLATE\s+utf8_bin\s+', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

# sqlite doesn't like 'unsigned' as in `ip_address_3` tinyint(3) unsigned NOT NULL default '27',
content = re.sub(r' unsigned ', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
content = re.sub(r' unsigned ', ' ', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

# sqlite doesn't like 'enum' as in `type` enum('normal','light_out_management') NOT NULL default 'normal',,
content = re.sub(r' enum\([^\)]*\) ', ' varchar(255) ', content)

@ -54,29 +55,27 @@ def mysql_to_sqlite( mysql_sql_code, truncate_hex_strings = False ):
# ALTER TABLE `blocked_ips`
# ADD PRIMARY KEY (`iid`),
# ADD KEY `blocked_ip` (`ip`);
content = re.sub(r'alter table [^;]*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )

content = re.sub(r'alter table [^;]*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

# COMMIT;
# sqlite3.OperationalError: cannot commit - no transaction is active
content = re.sub(r'commit\s*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL )
content = re.sub(r'commit\s*;', '', content, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)

# insert multiple values
# INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*\((.*)\*;', re.IGNORECASE | re.MULTILINE | re.DOTALL)
INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*([^;]*);', re.IGNORECASE | re.MULTILINE | re.DOTALL)
#INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*((\[^\)](\)));$', re.IGNORECASE | re.MULTILINE | re.DOTALL)
# INSERTVALS_RE = re.compile(r'^(INSERT INTO.*?VALUES)\s*((\[^\)](\)));$', re.IGNORECASE | re.MULTILINE | re.DOTALL)
INSERTVALS_SPLIT_RE = re.compile(r'\)\s*,\s*\(', re.IGNORECASE | re.MULTILINE | re.DOTALL)


def insertvals_replacer(match):
insert, values = match.groups()
# print("insert=%s"%insert)
# print("values=%s"%values)
values = re.sub('^\s*\(' ,'', values)
values = re.sub('\)\s*$' ,'', values)
values = re.sub(r'^\s*\(', '', values)
values = re.sub(r'\)\s*$', '', values)
replacement = ''
for vals in INSERTVALS_SPLIT_RE.split(values):
#print("vals=%s"%vals)
# print("vals=%s"%vals)
replacement = '%s\n%s (%s);' % (replacement, insert, vals)
return replacement
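A minimal usage sketch of the converter above, assuming mysql_to_sqlite returns the converted SQL text (as its use in SimpaDbUtil suggests); the dump file name is made up:

# hypothetical usage sketch, not part of the commit
import sqlite3
from mysql2sqlite import mysql_to_sqlite   # module name assumed from the import in SimpaDbUtil

with open('simpadb_dump.sql') as f:        # made-up mysql dump file
    sqlite_sql = mysql_to_sqlite(f.read(), truncate_hex_strings=True)
conn = sqlite3.connect(':memory:')
conn.executescript(sqlite_sql)             # replay the converted dump into an in-memory sqlite db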
@ -4,6 +4,7 @@
import socket
import struct


def wake_on_lan(macaddress):
""" Switches on remote computers using WOL. """

@ -32,11 +33,10 @@ def wake_on_lan(macaddress):


if __name__ == '__main__':

# Use macaddresses with any separators.
wake_on_lan('00:1E:52:F3:61:60') # simpatix28
#wake_on_lan('00:24:36:F2:D0:FA') # simpatix33
#wake_on_lan('0F:0F:DF:0F:BF:EF')
#wake_on_lan('0F-0F-DF-0F-BF-EF')
# wake_on_lan('00:24:36:F2:D0:FA') # simpatix33
# wake_on_lan('0F:0F:DF:0F:BF:EF')
# wake_on_lan('0F-0F-DF-0F-BF-EF')
# or without any separators.
#wake_on_lan('0F0FDF0FBFEF')
# wake_on_lan('0F0FDF0FBFEF')