J'ai remis le cluster controller en route. Il n'est pas si simple que ça à démarrer, alors j'en ai profité pour écrire l'installeur, qui tient également lieu de documentation. (partie 5)
This commit is contained in:
parent
e326ab1976
commit
d97fca4143
|
@ -181,7 +181,7 @@ class ClusterController:
|
|||
self.wakeUpMachinesForPendingJobs()
|
||||
|
||||
def storeSessionInDatabase( self ):
|
||||
conn = MySQLdb.connect('simpatix10', 'root', '', 'clustercontroller')
|
||||
conn = MySQLdb.connect('simpatix10', 'clusterctrl', '', 'clustercontroller')
|
||||
assert(conn)
|
||||
|
||||
# retrieve the session id, as it's an auto_increment field
|
||||
|
@ -226,10 +226,11 @@ class ClusterController:
|
|||
"""
|
||||
"""
|
||||
self.m_iSessionId = self.storeSessionInDatabase()
|
||||
log("storeSessionInDatabase completed")
|
||||
DELAY_BETWEEN_MEASURES = 10 # in seconds
|
||||
self.m_clusterStatus.startReadingThreads()
|
||||
while not self.m_clusterStatus.isReady():
|
||||
#log('waiting for system to be ready')
|
||||
log('waiting for system to be ready')
|
||||
time.sleep(1)
|
||||
None
|
||||
logInfo('ClusterController::run : cluster initial readings have completed')
|
||||
|
|
|
@ -17,10 +17,9 @@ class ClusterStatus:
|
|||
self.m_lock = threading.Lock() # to prevent concurrent access to this instance
|
||||
self.m_jobsStateUpdater = JobsStateUpdater( self )
|
||||
self.m_jobsState = None
|
||||
#self.m_controlledMachineNames = [ 'simpatix26', 'simpatix27', 'simpatix38', 'simpatix10' ]
|
||||
#self.m_controlledMachineNames = [ 'simpatix15' ]
|
||||
self.m_controlledMachineNames = [] # [ 'simpatix10' ]
|
||||
if True:
|
||||
self.m_controlledMachineNames = [ 'simpatix30' ]
|
||||
#self.m_controlledMachineNames = [] # [ 'simpatix30' ]
|
||||
if False:
|
||||
for iMachine in range(11, 40):
|
||||
if (iMachine == 31) or (iMachine == 32):
|
||||
continue # these machines don't seem to be able to go to sleep properly (bug 00000010)
|
||||
|
|
|
@ -4,7 +4,6 @@ import threading
|
|||
gLogFilePath = '/var/log/ClusterController.log'
|
||||
|
||||
def log( message ):
|
||||
return
|
||||
threadName = threading.currentThread().getName()
|
||||
logMessage = time.asctime(time.localtime())+' : '+ threadName + ' : ' + message
|
||||
print logMessage
|
||||
|
|
|
@ -20,7 +20,7 @@ class QstatParser:
|
|||
elif c == 't':
|
||||
jobState += JobStateFlags.TRANSFERING
|
||||
else:
|
||||
assert( False, 'unhandled job state flag :"' + c + '"' )
|
||||
assert False, 'unhandled job state flag :"' + c + '"'
|
||||
return jobState
|
||||
def parseQstatOutput( self, qstatOutput ):
|
||||
jobsState = JobsState()
|
||||
|
|
|
@ -26,7 +26,7 @@ class SunGridEngine:
|
|||
if False: # no need for job details at the moment and since it's very slow, it's been disabled
|
||||
for jobId, job in jobsState.getPendingJobs().iteritems():
|
||||
(returnCode, stdout, stderr) = executeProgram( ['qstat', '-j', job.getId().asStr()] )
|
||||
assert( returnCode != 0, 'prout' )
|
||||
assert returnCode != 0, 'prout'
|
||||
QstatParser().parseJobDetails( stdout, job )
|
||||
|
||||
return jobsState
|
||||
|
|
|
@ -207,6 +207,7 @@ def onException(exception):
|
|||
strMessage += f.getvalue()
|
||||
f.close()
|
||||
logError(strMessage)
|
||||
print(strMessage)
|
||||
Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
|
||||
executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
|
||||
exit()
|
||||
|
|
Loading…
Reference in New Issue