J'ai remis le cluster controller en route. Il n'est pas si simple que ça à démarrer, alors j'en ai profité pour écrire l'installeur, qui tient également lieu de documentation. (partie 5)

This commit is contained in:
Guillaume Raffy 2011-10-07 15:51:20 +00:00
parent e326ab1976
commit d97fca4143
6 changed files with 9 additions and 9 deletions

View File

@ -181,7 +181,7 @@ class ClusterController:
self.wakeUpMachinesForPendingJobs() self.wakeUpMachinesForPendingJobs()
def storeSessionInDatabase( self ): def storeSessionInDatabase( self ):
conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller') conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
assert(conn) assert(conn)
# retrieve the session id, as it's an auto_increment field # retrieve the session id, as it's an auto_increment field
@ -226,10 +226,11 @@ class ClusterController:
""" """
""" """
self.m_iSessionId = self.storeSessionInDatabase() self.m_iSessionId = self.storeSessionInDatabase()
log("storeSessionInDatabase completed")
DELAY_BETWEEN_MEASURES = 10 # in seconds DELAY_BETWEEN_MEASURES = 10 # in seconds
self.m_clusterStatus.startReadingThreads() self.m_clusterStatus.startReadingThreads()
while not self.m_clusterStatus.isReady(): while not self.m_clusterStatus.isReady():
#log('waiting for system to be ready') log('waiting for system to be ready')
time.sleep(1) time.sleep(1)
None None
logInfo('ClusterController::run : cluster initial readings have completed') logInfo('ClusterController::run : cluster initial readings have completed')

View File

@ -17,10 +17,9 @@ class ClusterStatus:
self.m_lock = threading.Lock() # to prevent concurrent access to this instance self.m_lock = threading.Lock() # to prevent concurrent access to this instance
self.m_jobsStateUpdater = JobsStateUpdater( self ) self.m_jobsStateUpdater = JobsStateUpdater( self )
self.m_jobsState = None self.m_jobsState = None
#self.m_controlledMachineNames = [ 'simpatix26', 'simpatix27', 'simpatix38', 'simpatix10' ] self.m_controlledMachineNames = [ 'simpatix30' ]
#self.m_controlledMachineNames = [ 'simpatix15' ] #self.m_controlledMachineNames = [] # [ 'simpatix30' ]
self.m_controlledMachineNames = [] # [ 'simpatix10' ] if False:
if True:
for iMachine in range(11, 40): for iMachine in range(11, 40):
if (iMachine == 31) or (iMachine == 32): if (iMachine == 31) or (iMachine == 32):
continue # these machines don't seem to be able to go to sleep properly (bug 00000010) continue # these machines don't seem to be able to go to sleep properly (bug 00000010)

View File

@ -4,7 +4,6 @@ import threading
gLogFilePath = '/var/log/ClusterController.log' gLogFilePath = '/var/log/ClusterController.log'
def log( message ): def log( message ):
return
threadName = threading.currentThread().getName() threadName = threading.currentThread().getName()
logMessage = time.asctime(time.localtime())+' : '+ threadName + ' : ' + message logMessage = time.asctime(time.localtime())+' : '+ threadName + ' : ' + message
print logMessage print logMessage

View File

@ -20,7 +20,7 @@ class QstatParser:
elif c == 't': elif c == 't':
jobState += JobStateFlags.TRANSFERING jobState += JobStateFlags.TRANSFERING
else: else:
assert( False, 'unhandled job state flag :"' + c + '"' ) assert False, 'unhandled job state flag :"' + c + '"'
return jobState return jobState
def parseQstatOutput( self, qstatOutput ): def parseQstatOutput( self, qstatOutput ):
jobsState = JobsState() jobsState = JobsState()

View File

@ -26,7 +26,7 @@ class SunGridEngine:
if False: # no need for job details at the moment and since it's very slow, it's been disabled if False: # no need for job details at the moment and since it's very slow, it's been disabled
for jobId, job in jobsState.getPendingJobs().iteritems(): for jobId, job in jobsState.getPendingJobs().iteritems():
(returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] ) (returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] )
assert( returnCode != 0, 'prout' ) assert returnCode != 0, 'prout'
QstatParser().parseJobDetails( stdout, job ) QstatParser().parseJobDetails( stdout, job )
return jobsState return jobsState

View File

@ -207,6 +207,7 @@ def onException(exception):
strMessage += f.getvalue() strMessage += f.getvalue()
f.close() f.close()
logError(strMessage) logError(strMessage)
print(strMessage)
Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage) Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
exit() exit()