228 lines
11 KiB
Python
228 lines
11 KiB
Python
import Lib.Util
import Lib.SimpaDbUtil
from Log import *
from PowerState import *
# note : the standard library imports are deliberately kept after the wildcard imports above, so that
# a name exported by Log or PowerState can never shadow one of these modules
import os
import re
import StringIO
import sys
import time
import traceback
|
|
|
|
def executeProgram( astrArguments ):
    """
        runs a program through Lib.Util.executeProgram and traces the call in the debug log

        @param astrArguments the program followed by its arguments, as a list of strings
        @return the (returnCode, stdout, stderr) tuple given by Lib.Util.executeProgram
    """
    def describeProgram():
        # comma separated form of the command line, as it appears in the log messages
        return ','.join(astrArguments)

    # extra tracing that is kept for as long as bug 00000008 is not closed
    bTraceEveryCall = True

    if bTraceEveryCall:
        logDebug('executeProgram : program = [%s]' % (describeProgram() ))

    returnCode, stdout, stderr = Lib.Util.executeProgram( astrArguments )

    if bTraceEveryCall:
        logDebug('executeCommand : return code of [%s] = %d' % (describeProgram(), returnCode))

    bProgramFailed = (returnCode != 0)
    if bProgramFailed:
        # for debugging purpose, dump everything we know about the failed program
        logDebug('executeCommand : return code of [%s] = %d' % (describeProgram(), returnCode))
        logDebug('executeCommand : stdout of [%s] = %s' % (describeProgram(), stdout))
        logDebug('executeCommand : stderr of [%s] = %s' % (describeProgram(), stderr))

    return (returnCode, stdout, stderr)
|
|
|
|
def executeCommand( command ):
    """
        runs a command line through Lib.Util.executeCommand

        @param command the command line to execute, as a single string
        @return the (returnCode, stdout, stderr) tuple given by Lib.Util.executeCommand
    """
    # note : the debug traces of the command and of its return code are currently disabled
    iExitStatus, strOutput, strErrors = Lib.Util.executeCommand( command )
    return (iExitStatus, strOutput, strErrors)
|
|
|
|
def executeIpmiCommand( machineName, ipmiCommandArgs ):
    """
        sends an ipmitool command to the light out management interface of the given machine

        @param machineName the name of the machine; its light out management ip address is looked up with Lib.SimpaDbUtil.getLightOutManagementIpAddress
        @param ipmiCommandArgs the ipmitool sub command and its arguments, as a list of strings (eg ['chassis', 'power', 'on'])
        @return the (returnCode, stdout, stderr) tuple of the ipmitool program

        @note as long as the workaround for bug 00000005 is enabled (it is hard coded to True), this function only returns once ipmitool succeeded, so the returned code is always 0
    """
    strLomAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress( machineName )
    strPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
    astrCommandLine = ['ipmitool', '-U', 'admin', '-H', strLomAddress, '-f', strPasswordFilepath] + list( ipmiCommandArgs )

    # bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might
    # give different errors, these errors not always being timeouts). Therefore we try and try again,
    # until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi
    # commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected;
    # therefore, with this hack, cluster controller is expecting all machines to be plugged.
    #
    # examples of failures that have been observed :
    #
    #   sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to insufficient resources for session (0xFFFEF901)
    #   -> this error means that the number of active connections to the BMC has reached the maximum (usually 5).
    #
    #   sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
    #
    #   sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to Timeout (0xFFFEF9C3)
    bRetryUntilSuccess = True

    if not bRetryUntilSuccess:
        # single attempt : the caller has to deal with failures
        (returnCode, stdout, stderr) = executeProgram( astrCommandLine )
        return (returnCode, stdout, stderr)

    while True:
        (returnCode, stdout, stderr) = executeProgram( astrCommandLine )
        if returnCode == 0:
            return (returnCode, stdout, stderr)
        logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrCommandLine))
        time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
|
|
|
|
def getPowerState( machineName ):
    """
        reads the power state of the given machine from its 'ACPI State' ipmi sensor

        @param machineName the name of the machine to query
        @return PowerState.ON, PowerState.SLEEP or PowerState.OFF depending on the acpi state reported by ipmitool;
                PowerState.ON if the output of ipmitool contains no acpi state at all (see bug 00000002);
                PowerState.UNPLUGGED if the ipmi command failed iMAX_NUM_ATTEMPTS times
        @raise AssertionError if ipmitool reports an acpi state that this function doesn't know
    """
    iMAX_NUM_ATTEMPTS = 5
    dictAcpiStateToPowerState = {
        'S0/G0': PowerState.ON,
        'S3': PowerState.SLEEP, # memory is still powered
        'S5/G2': PowerState.OFF, # soft-off
    }
    iNumFailedAttempts = 0
    while True:
        (returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['sensor', 'get', 'ACPI State'] )
        if returnCode != 0:
            # error ... it's either because the machine is unplugged or because the machine is busy (the
            # command has been seen failing for no apparent reason). In order to differentiate these 2
            # cases, we try again and if this command fails too many times then we decide it's unplugged
            # (dodgy, but no other way to tell these cases apart is known for now)
            iNumFailedAttempts += 1
            if iNumFailedAttempts >= iMAX_NUM_ATTEMPTS:
                logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
                return PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged
            logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
            time.sleep(5)
            continue

        matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
        if matchObj is None:
            # degenerate ipmitool output (see bug 00000002) : apparently it's a 'feature' of all 4 core
            # machines : if the machine is woken up using ipmitool, then no power on event is logged ...
            # The power was on each time this case was observed, hence the assumption. No warning is
            # logged here because it would pollute the logs.
            return PowerState.ON

        strAcpiState = matchObj.group('AcpiState')
        if strAcpiState not in dictAcpiStateToPowerState:
            print(strAcpiState)
            # raised explicitly (instead of assert(False)) so that the check also holds when python runs with -O
            raise AssertionError('unexpected acpi state %r for machine %s' % (strAcpiState, machineName))
        return dictAcpiStateToPowerState[strAcpiState]
|
|
|
|
def wakeUpMachine( machineName ):
    """
        orders the given machine to power on, using the ipmi command 'chassis power on'

        this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)

        @param machineName the name of the machine to wake up
        @return true if the ipmi command succeeded, false otherwise (this command can fail if the machine is manually unplugged for example)
        @note this method has been seen failing once for no obvious reason.. maybe the command does not succeed if the machine is in a transition state
    """
    iExitStatus, strOutput, strErrors = executeIpmiCommand( machineName, ['chassis', 'power', 'on'] )
    return (iExitStatus == 0)
|
|
|
|
def blockingPutMachineToSleep( machineName ):
    """
        puts the given machine to sleep and waits until its power state is actually PowerState.SLEEP

        the sleep order ('pmset sleepnow', sent via ssh) is sent at most iMaxNumAttempts times; after each order,
        the power state of the machine is polled every 5 seconds, for up to iMaxGoToSleepDuration seconds.

        @param machineName the name of the machine to put to sleep
        @return true on success, false if the machine is still not sleeping after iMaxNumAttempts sleep orders
        @raise AssertionError if the machine is found in a power state other than ON or SLEEP while waiting
    """
    iMaxNumAttempts = 5
    iMaxGoToSleepDuration = 30 # in seconds
    iPollingPeriod = 5 # in seconds
    bBUG_239_IS_STILL_ALIVE = True

    logInfo('putting machine %s to sleep...' % machineName)
    # note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
    for iAttempt in range(1, iMaxNumAttempts + 1):
        # note : pmset must be executed as root
        # the result of ssh is ignored on purpose : only the power state tells if the order has been obeyed
        executeProgram(['ssh', machineName, 'pmset sleepnow'])

        # check if the machine actually went to sleep
        iDelay = 0
        while iDelay < iMaxGoToSleepDuration:
            time.sleep(iPollingPeriod)
            iDelay += iPollingPeriod
            ePowerState = getPowerState( machineName )
            if ePowerState == PowerState.SLEEP:
                logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
                return True
            if ePowerState != PowerState.ON:
                logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
                # raised explicitly (instead of using assert) so that the check also holds when python runs with -O
                raise AssertionError('unexpected power state for machine %s while waiting for it to sleep' % machineName)

        if iAttempt < iMaxNumAttempts:
            logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))

    # the machine is still awake after iMaxNumAttempts sleep orders
    if bBUG_239_IS_STILL_ALIVE:
        logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
    else:
        logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
    return False
|
|
|
|
def blockingWakeUpMachine(machineName):
    """
        wakes up the given machine and waits until it responds

        each round has two phases :
        - the wake up order is sent with wakeUpMachine until it is accepted (at most iMaxNumWakeUpAttempts times, 5 seconds apart)
        - Lib.SimpaDbUtil.isMachineResponding is then polled every 5 seconds, until the machine responds or WAKEUPTIMEOUT is exceeded
        a second round is started if the first one timed out.

        @param machineName the name of the machine to wake up
        @return true if the machine responds, false if the wake up order kept failing or if the machine didn't respond in time after 2 rounds
    """
    iMaxNumWakeUpAttempts = 50
    iOrderRetryDelay = 5 # in seconds
    WAKEUPTIMEOUT = 5*60 # max number of seconds allowed for a machine to be alive after a wakeup request

    logInfo('waking up machine %s...' % machineName)
    iNumTimedOutRounds = 0
    # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
    while True:
        # phase 1 : send the wake up order until it is accepted
        # (the order can fail if the machine is already in a transition, so we try several times before giving up)
        iNumWakeUpAttempts = 0
        while True:
            bOrderAccepted = wakeUpMachine( machineName )
            iNumWakeUpAttempts += 1
            if bOrderAccepted:
                break
            if iNumWakeUpAttempts >= iMaxNumWakeUpAttempts:
                logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
                return False # couldn't wake up the machine for whatever reason
            logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iOrderRetryDelay))
            time.sleep(iOrderRetryDelay)

        # phase 2 : wait until the machine is operational
        bTimedOut = False
        wakeUpToAliveDuration = 0
        while not Lib.SimpaDbUtil.isMachineResponding( machineName ):
            time.sleep(5)
            wakeUpToAliveDuration += 5
            if wakeUpToAliveDuration > WAKEUPTIMEOUT:
                # the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
                logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
                bTimedOut = True
                break

        if not bTimedOut:
            # wake up completed
            logInfo('Waking up of machine %s completed successfully' % machineName)
            return True

        iNumTimedOutRounds += 1
        if iNumTimedOutRounds >= 2:
            logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
            return False # power state changed manually ?
        logWarning('attempting to wake up %s one more time' % (machineName))
|
|
|
|
def onException(exception):
    """
        last resort handler : reports the given exception, then kills the whole process

        the exception and the traceback of the exception being handled are written to the error log and to
        stdout, then sent by email to the administrator. The process is finally killed with 'kill -9', so
        that the other threads stop immediately.

        @param exception the exception that has been caught
        @note this function is not expected to return
    """
    sys.stdout.flush()

    # note : the 'message' attribute of exceptions is deprecated and not every exception has it; reading it
    # could raise in here, which would prevent the process from being killed. So the exception is converted
    # to a string instead.
    try:
        strDescription = str( exception )
    except Exception:
        strDescription = repr( exception )
    strMessage = 'exception %s : %s\n' % (type( exception ), strDescription)
    strMessage += traceback.format_exc()

    logError(strMessage)
    print(strMessage)

    try:
        # I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
        # by a failure of sendTextMail... that's why the sendmail is embedded inside a try except block, so that if this operation fails, then the
        # kill of the main process is still executed.
        Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
    except BaseException:
        # BaseException is caught on purpose : nothing must prevent the kill below
        logError("Could not send the email to notify the administrator that cluster controller failed")

    executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
    sys.exit()
|
|
|
|
|