cocluto/ClusterController/Util.py

228 lines
11 KiB
Python
Raw Normal View History

import Lib.Util
import Lib.SimpaDbUtil
from Log import *
from PowerState import *
# standard library imports are kept after the star imports so that these names always refer to the standard modules
import re
import StringIO
import os
import time
import traceback
import sys
def executeProgram( astrArguments ):
    """
    runs a program through Lib.Util.executeProgram and traces the call in the debug log

    @param astrArguments the program followed by its arguments, as a sequence of strings
    @return the tuple (returnCode, stdout, stderr) given by Lib.Util.executeProgram
    """
    # every call is traced as long as bug 00000008 is not fixed
    bTraceEveryCall = True
    strProgram = ','.join(astrArguments)
    if bTraceEveryCall:
        logDebug('executeProgram : program = [%s]' % (strProgram))

    (returnCode, stdout, stderr) = Lib.Util.executeProgram( astrArguments )

    strReturnCodeTrace = 'executeCommand : return code of [%s] = %d' % (strProgram, returnCode)
    if bTraceEveryCall:
        logDebug(strReturnCodeTrace)
    if returnCode != 0:
        # for debugging purpose, log the return code and both output streams of the failed command
        logDebug(strReturnCodeTrace)
        logDebug('executeCommand : stdout of [%s] = %s' % (strProgram, stdout))
        logDebug('executeCommand : stderr of [%s] = %s' % (strProgram, stderr))
    return (returnCode, stdout, stderr)
def executeCommand( command ):
    """
    runs a command through Lib.Util.executeCommand

    @param command the command, handed over unchanged to Lib.Util.executeCommand (e.g. 'kill -9 <pid>' in onException)
    @return the tuple (returnCode, stdout, stderr)
    @note unlike executeProgram, nothing is written to the debug log
    """
    (iExitCode, strOutput, strErrors) = Lib.Util.executeCommand( command )
    return (iExitCode, strOutput, strErrors)
def executeIpmiCommand( machineName, ipmiCommandArgs ):
    """
    runs ipmitool against the light out management card of the given machine

    @param machineName the machine whose light out management ip address is looked up with Lib.SimpaDbUtil
    @param ipmiCommandArgs the ipmitool sub-command and its arguments, as a list of strings (e.g. ['chassis', 'power', 'on'])
    @return the tuple (returnCode, stdout, stderr) of the ipmitool call
    @note as long as the workaround for bug 00000005 is enabled, this function only returns once ipmitool has succeeded
    """
    strLomAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress( machineName )
    astrIpmiTool = ['ipmitool', '-U', 'admin', '-H', strLomAddress, '-f', '/usr/local/etc/LightOutManagementPassword.txt']
    astrIpmiTool.extend( ipmiCommandArgs )

    # bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give
    # different errors, these errors not always being timeouts). Therefore we try and try again, until the
    # command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail.
    # The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this
    # hack, cluster controller is expecting all machines to be plugged.
    #
    # examples of failures that have been observed :
    #   sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to insufficient resources for session (0xFFFEF901)
    #   -> this error means that the number of active connections to the BMC has reached the maximum (usually 5).
    #   sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
    #   sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to Timeout (0xFFFEF9C3)
    bRetryUntilSuccess = True
    while True:
        (returnCode, stdout, stderr) = executeProgram( astrIpmiTool )
        if returnCode == 0 or not bRetryUntilSuccess:
            return (returnCode, stdout, stderr)
        logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrIpmiTool))
        time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity
def getPowerState( machineName ):
    """
    reads the power state of a machine from the 'ACPI State' sensor of its light out management card

    @param machineName the name of the machine to query
    @return PowerState.ON, PowerState.SLEEP or PowerState.OFF according to the ACPI state reported by ipmitool;
        PowerState.ON when ipmitool succeeds but its output contains no ACPI state (workaround for bug 00000002);
        PowerState.UNPLUGGED when ipmitool has failed 5 times in a row
    @raise AssertionError if ipmitool reports an ACPI state other than S0/G0, S3 or S5/G2
    """
    iMAX_NUM_ATTEMPTS = 5
    bBUG_00000002_IS_STILL_ACTIVE = True
    iNumFailedAttempts = 0
    while True:
        (returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['sensor', 'get', 'ACPI State'] )
        if returnCode != 0:
            # error ... it's either because the machine is unplugged or because the machine is busy (the
            # command has been seen failing for no apparent reason). In order to differentiate these 2 cases,
            # we try again and if the command fails too many times then we decide the machine is unplugged
            # (dodgy, but no better way of telling these cases apart is known for now)
            iNumFailedAttempts += 1
            if iNumFailedAttempts >= iMAX_NUM_ATTEMPTS:
                logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
                return PowerState.UNPLUGGED
            logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
            time.sleep(5)
            continue

        # the state is the text starting with S<digit>, between an opening square bracket and the next colon
        matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
        if matchObj is None:
            if bBUG_00000002_IS_STILL_ACTIVE:
                # no warning is logged here because it pollutes the logs and apparently it's a 'feature'
                # of all 4 core machines : if the machine is woken up using ipmitool, then no power on
                # event is logged ... the machine has been observed to be on in that case
                return PowerState.ON
            raise AssertionError('no ACPI state found in the output of ipmitool for machine %s' % machineName)

        strAcpiState = matchObj.group('AcpiState')
        if strAcpiState == 'S0/G0':
            return PowerState.ON
        if strAcpiState == 'S3': # memory is still powered
            return PowerState.SLEEP
        if strAcpiState == 'S5/G2': # soft-off
            return PowerState.OFF
        # raised explicitly (instead of assert False) so that the check survives python -O
        print(strAcpiState)
        raise AssertionError('unexpected ACPI state for machine %s : %s' % (machineName, strAcpiState))
def wakeUpMachine( machineName ):
    """
    sends a 'chassis power on' order to the machine through ipmi

    this method seems more reliable than wake on lan (sometimes, sending wake on lan packet seems to have no effect)
    @param machineName the name of the machine to power on
    @return true on success (ipmitool returned 0), false otherwise
    @note I once had this method failing for no obvious reason.. maybe this command does not succeed if the machine is in a transition state
    """
    (iExitCode, strOutput, strErrors) = executeIpmiCommand( machineName, ['chassis', 'power', 'on'] )
    # a non zero exit code can happen if the machine is manually unplugged for example
    return iExitCode == 0
def blockingPutMachineToSleep( machineName ):
    """
    asks the machine to go to sleep (ssh + pmset) and waits until its power state is actually SLEEP

    @param machineName the name of the machine to put to sleep
    @return true on success, false otherwise (more than 5 sleep orders were sent without the machine going to sleep)
    @raise AssertionError if, while waiting, the machine is found in a power state other than ON or SLEEP
    """
    logInfo('putting machine %s to sleep...' % machineName)
    iMaxNumAttempts = 5
    iMaxGoToSleepDuration = 30 # in seconds
    iPollingPeriod = 5 # in seconds
    bBUG_239_IS_STILL_ALIVE = True
    iNumFailedOrders = 0
    # note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
    while True:
        # note : pmset must be executed as root
        executeProgram(['ssh', machineName, 'pmset sleepnow'])

        # check if the machine actually went to sleep, by polling its power state
        for iPoll in range(iMaxGoToSleepDuration // iPollingPeriod):
            time.sleep(iPollingPeriod)
            ePowerState = getPowerState( machineName )
            if ePowerState == PowerState.SLEEP:
                logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
                return True
            if ePowerState != PowerState.ON:
                logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
            assert(ePowerState == PowerState.ON)

        # the machine is still on after the waiting period : this order has failed
        iNumFailedOrders += 1
        if iNumFailedOrders <= iMaxNumAttempts:
            logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
            continue
        if bBUG_239_IS_STILL_ALIVE:
            logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
        else:
            logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
        return False
def blockingWakeUpMachine(machineName):
    """
    wakes up the machine and waits until Lib.SimpaDbUtil.isMachineResponding reports it as responding

    @param machineName the name of the machine to wake up
    @return true once the machine responds; false if the wake up order failed 50 times in a row, or if the
        machine did not respond in time after each of 2 successful wake up orders
    """
    logInfo('waking up machine %s...' % machineName)
    iMaxNumWakeUpAttempts = 50
    iRetryDelay = 5
    WAKEUPTIMEOUT = 5*60 # max number of seconds allowed for a machine to be alive after a wakeup request
    iNumIgnoredOrders = 0
    # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
    while True:
        # first step : get the wake up order accepted. It can fail if the machine is already in a
        # transition; in that case we try several times before giving up
        iNumWakeUpAttempts = 0
        while True:
            bOrderAccepted = wakeUpMachine( machineName )
            iNumWakeUpAttempts += 1
            if bOrderAccepted:
                break
            if iNumWakeUpAttempts >= iMaxNumWakeUpAttempts:
                logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
                return False # couldn't wake up to machine for whatever reason
            logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iRetryDelay))
            time.sleep(iRetryDelay)

        # second step : wait until the machine is operational
        bTimedOut = False
        iWaitedDuration = 0
        while not Lib.SimpaDbUtil.isMachineResponding( machineName ):
            time.sleep(5)
            iWaitedDuration += 5
            if iWaitedDuration > WAKEUPTIMEOUT:
                # the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
                logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
                bTimedOut = True
                break

        if not bTimedOut:
            # wake up completed
            logInfo('Waking up of machine %s completed successfully' % machineName)
            return True

        iNumIgnoredOrders += 1
        if iNumIgnoredOrders >= 2:
            logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
            return False # power state changed manually ?
        logWarning('attempting to wake up %s one more time' % (machineName))
def onException(exception):
    """
    reports a fatal exception (error log, standard output, mail to the administrator) then kills the whole process

    @param exception the exception to report. The traceback is the one of the exception currently being
        handled, so this function is meant to be called from the except block that caught the exception
    @note this function does not return : the process is killed with 'kill -9' so that the other threads stop immediately
    """
    sys.stdout.flush()
    # not every exception object has a 'message' attribute; fall back to str() so that the report
    # itself can't fail with an AttributeError (which would prevent the mail and the kill below)
    strDetails = getattr(exception, 'message', None)
    if strDetails is None:
        strDetails = str(exception)
    strMessage = 'exception %s : %s\n' % (type( exception ), strDetails)
    strMessage += traceback.format_exc()
    logError(strMessage)
    print(strMessage)
    try:
        # I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
        # by a failure of sendTextMail... that's why the sendmail is embedded inside a try except block, so that if this operation fails, then the
        # kill of the main process is still executed.
        Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
    except BaseException:
        # deliberately broad : whatever goes wrong with the mail, the process must still be killed
        logError("Could not send the email to notify the administrator that cluster controller failed")
    executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
    sys.exit()