J'ai remis le cluster controller en route. Il n'est pas si simple que ça à démarrer, alors j'en ai profité pour écrire l'installeur, qui tient également lieu de documentation. (partie 5)

This commit is contained in:
Guillaume Raffy 2011-10-07 15:51:20 +00:00
parent e326ab1976
commit d97fca4143
6 changed files with 9 additions and 9 deletions

View File

@ -181,7 +181,7 @@ class ClusterController:
self.wakeUpMachinesForPendingJobs()
def storeSessionInDatabase( self ):
conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller')
conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
assert(conn)
# retrieve the session id, as it's an auto_increment field
@ -226,10 +226,11 @@ class ClusterController:
"""
"""
self.m_iSessionId = self.storeSessionInDatabase()
log("storeSessionInDatabase completed")
DELAY_BETWEEN_MEASURES = 10 # in seconds
self.m_clusterStatus.startReadingThreads()
while not self.m_clusterStatus.isReady():
#log('waiting for system to be ready')
log('waiting for system to be ready')
time.sleep(1)
None
logInfo('ClusterController::run : cluster initial readings have completed')

View File

@ -17,10 +17,9 @@ class ClusterStatus:
self.m_lock = threading.Lock() # to prevent concurrent access to this instance
self.m_jobsStateUpdater = JobsStateUpdater( self )
self.m_jobsState = None
#self.m_controlledMachineNames = [ 'simpatix26', 'simpatix27', 'simpatix38', 'simpatix10' ]
#self.m_controlledMachineNames = [ 'simpatix15' ]
self.m_controlledMachineNames = [] # [ 'simpatix10' ]
if True:
self.m_controlledMachineNames = [ 'simpatix30' ]
#self.m_controlledMachineNames = [] # [ 'simpatix30' ]
if False:
for iMachine in range(11, 40):
if (iMachine == 31) or (iMachine == 32):
continue # these machines don't seem to be able to go to sleep properly (bug 00000010)

View File

@ -4,7 +4,6 @@ import threading
gLogFilePath = '/var/log/ClusterController.log'
def log( message ):
return
threadName = threading.currentThread().getName()
logMessage = time.asctime(time.localtime())+' : '+ threadName + ' : ' + message
print logMessage

View File

@ -20,7 +20,7 @@ class QstatParser:
elif c == 't':
jobState += JobStateFlags.TRANSFERING
else:
assert( False, 'unhandled job state flag :"' + c + '"' )
assert False, 'unhandled job state flag :"' + c + '"'
return jobState
def parseQstatOutput( self, qstatOutput ):
jobsState = JobsState()

View File

@ -26,7 +26,7 @@ class SunGridEngine:
if False: # no need for job details at the moment and since it's very slow, it's been disabled
for jobId, job in jobsState.getPendingJobs().iteritems():
(returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] )
assert( returnCode != 0, 'prout' )
assert returnCode != 0, 'prout'
QstatParser().parseJobDetails( stdout, job )
return jobsState

View File

@ -207,6 +207,7 @@ def onException(exception):
strMessage += f.getvalue()
f.close()
logError(strMessage)
print(strMessage)
Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
exit()