import Lib.Util
import Lib.SimpaDbUtil
from Log import *
from PowerState import *
# The standard-library imports are kept after the wildcard imports (as in the
# original ordering) so that nothing exported by Log / PowerState shadows them.
import re
import StringIO
import os
import time
import traceback
import sys


def executeProgram(astrArguments):
    """
        Runs an external program through Lib.Util.executeProgram, logging the
        call and its return code while bug 00000008 is being investigated.

        @param astrArguments the program and its arguments, as a list of strings
        @return the tuple (returnCode, stdout, stderr)
    """
    bBUG_00000008_IS_STILL_ACTIVE = True
    if bBUG_00000008_IS_STILL_ACTIVE:
        logDebug('executeProgram : program = [%s]' % (','.join(astrArguments)))
    (returnCode, stdout, stderr) = Lib.Util.executeProgram(astrArguments)
    if bBUG_00000008_IS_STILL_ACTIVE:
        # the label used to read 'executeCommand', which pointed at the wrong function
        logDebug('executeProgram : return code of [%s] = %d' % (','.join(astrArguments), returnCode))
    return (returnCode, stdout, stderr)


def executeCommand(command):
    """
        Runs a command line through Lib.Util.executeCommand (no logging).

        @param command the command line, as a single string
        @return the tuple (returnCode, stdout, stderr)
    """
    (returnCode, stdout, stderr) = Lib.Util.executeCommand(command)
    return (returnCode, stdout, stderr)


def executeIpmiCommand(machineName, ipmiCommandArgs):
    """
        Runs an ipmitool command against the lights-out management interface
        of the given machine.

        @param machineName name of the machine, used to look up its LOM ip address
        @param ipmiCommandArgs the ipmitool sub-command and its arguments, as a list of strings
        @return the tuple (returnCode, stdout, stderr) of the ipmitool call
        @note while the bug 00000005 workaround is active, this function only
              returns once ipmitool has succeeded (returnCode is then always 0)
    """
    lomIpAddress = Lib.SimpaDbUtil.getLightOutManagementIpAddress(machineName)
    lomPasswordFilepath = '/usr/local/etc/LightOutManagementPassword.txt'
    astrProgram = ['ipmitool', '-U', 'admin', '-H', lomIpAddress, '-f', lomPasswordFilepath]
    astrProgram.extend(ipmiCommandArgs)

    bBUG_00000005_IS_STILL_ACTIVE = True
    if bBUG_00000005_IS_STILL_ACTIVE:
        # Bug 00000005 causes ipmitool to randomly fail for no apparent reason
        # (two consecutive calls might give different errors, these errors not
        # always being timeouts). Therefore we try again and again, until the
        # command succeeds. If we don't do this, cluster controller keeps
        # stopping because ipmi commands fail. The effect of this hack is that
        # the UNPLUGGED power state is no longer detected; therefore, with this
        # hack, cluster controller expects all machines to be plugged.
        while True:
            (returnCode, stdout, stderr) = executeProgram(astrProgram)
            if returnCode == 0:
                break
            logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrProgram))
            time.sleep(5)  # wait before the next attempt, in order not to saturate activity
    else:
        (returnCode, stdout, stderr) = executeProgram(astrProgram)

    # Examples of ipmitool failures that have been observed:
    #
    #   sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to insufficient resources for session (0xFFFEF901)
    #     -> this error means that the number of active connections to the BMC
    #        has reached the maximum (usually 5).
    #
    #   sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
    #
    #   sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
    #   Unabled to establish a session with the BMC.
    #   Command failed due to Timeout (0xFFFEF9C3)
    return (returnCode, stdout, stderr)


def getPowerState(machineName):
    """
        Reads the power state of a machine from its 'ACPI State' ipmi sensor.

        @param machineName name of the machine to query
        @return PowerState.ON, PowerState.SLEEP or PowerState.OFF depending on
                the reported ACPI state; PowerState.ON when the sensor output
                cannot be parsed (bug 00000002); PowerState.UNPLUGGED when the
                ipmi command failed too many times
        @raise AssertionError if the sensor reports an ACPI state that is not handled
    """
    iMAX_NUM_ATTEMPTS = 5
    ePowerState = PowerState.UNKNOWN
    bPowerStateRead = False
    iNumFailedAttempts = 0
    while not bPowerStateRead:
        (returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['sensor', 'get', 'ACPI State'])
        if returnCode == 0:
            # the state is the token that starts with 'S<digit>' right after a
            # '[' and runs up to the next ':'
            matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
            bBUG_00000002_IS_STILL_ACTIVE = True
            if bBUG_00000002_IS_STILL_ACTIVE:
                if matchObj is None:
                    # Degenerate ipmitool output (see bug 00000002). No warning
                    # is logged because it pollutes the logs and apparently
                    # it's a 'feature' of all 4 core machines : if the machine
                    # is woken up using ipmitool, then no power on event is
                    # logged ... The power was on each time this case was seen,
                    # so that's what is assumed.
                    return PowerState.ON
            else:
                assert(matchObj)
            strAcpiState = matchObj.group('AcpiState')
            if strAcpiState == 'S0/G0':
                ePowerState = PowerState.ON
            elif strAcpiState == 'S3':  # memory is still powered
                ePowerState = PowerState.SLEEP
            elif strAcpiState == 'S5/G2':  # soft-off
                ePowerState = PowerState.OFF
            else:
                print(strAcpiState)
                # explicit raise rather than assert(False), so that the check
                # is not stripped when python runs with -O
                raise AssertionError('unexpected ACPI state : %s' % strAcpiState)
            bPowerStateRead = True
        else:
            # Error ... it's either because the machine is unplugged or because
            # the machine is busy (the command has been seen failing for no
            # apparent reason, hence the suspicion that the machine was busy).
            # In order to differentiate these 2 cases, we try again and if this
            # command fails too many times then we decide it's unplugged (very
            # dodgy, but no other way to tell these cases apart is known for now).
            # Note : this branch cannot be reached while the bug 00000005
            # workaround in executeIpmiCommand is active, since that function
            # then only returns on success.
            iNumFailedAttempts += 1
            if iNumFailedAttempts < iMAX_NUM_ATTEMPTS:
                logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
                time.sleep(5)
            else:
                logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
                ePowerState = PowerState.UNPLUGGED  # too many attempts failed ... probably because the machine is unplugged
                bPowerStateRead = True
    return ePowerState


def wakeUpMachine(machineName):
    """
        Sends an ipmi 'chassis power on' order to the machine.

        This method seems more reliable than wake on lan (sometimes, sending a
        wake on lan packet seems to have no effect).

        @param machineName name of the machine to power on
        @return true on success, false otherwise
        @note this method once failed for no obvious reason.. maybe this
              command does not succeed if the machine is in a transition state
    """
    (returnCode, stdout, stderr) = executeIpmiCommand(machineName, ['chassis', 'power', 'on'])
    # this command can fail if the machine is manually unplugged for example
    return returnCode == 0


def blockingPutMachineToSleep(machineName):
    """
        Asks the machine to go to sleep (via 'pmset sleepnow' over ssh) and
        waits until its power state actually is PowerState.SLEEP, sending the
        order again when it had no effect.

        @param machineName name of the machine to put to sleep
        @return true on success, false otherwise
        @raise AssertionError if the machine is found in a state other than ON or SLEEP
    """
    logInfo('putting machine %s to sleep...' % machineName)
    iMaxNumAttempts = 5
    iMaxGoToSleepDuration = 30  # in seconds
    bBUG_00000010_IS_STILL_ALIVE = True
    iAttempt = 0
    # note : each sleep order is not actually succeeding (god knows why).
    # Therefore, we need to try again and again.
    while True:
        # note : pmset must be executed as root
        executeProgram(['ssh', machineName, 'pmset sleepnow'])
        # check if the machine actually went to sleep
        iDelay = 0
        while iDelay < iMaxGoToSleepDuration:
            time.sleep(5)
            iDelay += 5
            ePowerState = getPowerState(machineName)
            if ePowerState == PowerState.SLEEP:
                logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
                return True
            if ePowerState != PowerState.ON:
                logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
                # explicit raise rather than assert, so that the check is not
                # stripped when python runs with -O
                raise AssertionError('unexpected power state for %s' % machineName)
        iAttempt += 1
        # NOTE(review): with '>' the order is sent up to iMaxNumAttempts + 1
        # times before giving up -- confirm whether '>=' was intended
        if iAttempt > iMaxNumAttempts:
            if bBUG_00000010_IS_STILL_ALIVE:
                logWarning('the attempt to put %s to sleep failed to many times (probably because of bug 00000010 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
            else:
                logWarning('the attempt to put %s to sleep failed to many times... giving up' % (machineName))
            return False
        logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))


def blockingWakeUpMachine(machineName):
    """
        Wakes the machine up and waits until it responds.

        @param machineName name of the machine to wake up
        @return true once the machine responds; false if the wake up order
                could not be sent, or if the machine did not respond in time
                after two successful wake up orders
    """
    logInfo('waking up machine %s...' % machineName)
    iMaxNumWakeUpAttempts = 50
    WAKEUPTIMEOUT = 5 * 60  # max number of seconds allowed for a machine to be alive after a wakeup request
    numAttempts = 0
    bWakeUpFailed = True
    while bWakeUpFailed:  # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
        iNumWakeUpAttempts = 0
        bWakeUpMachineSucceeded = False
        while not bWakeUpMachineSucceeded:
            bWakeUpMachineSucceeded = wakeUpMachine(machineName)
            iNumWakeUpAttempts += 1
            # the previous command can fail if the machine is already in a
            # transition; in that case we try several times before giving up
            if not bWakeUpMachineSucceeded:
                if iNumWakeUpAttempts < iMaxNumWakeUpAttempts:
                    iDelay = 5
                    logWarning('wake up attempt %d of %s failed... I\'ll try again in %d seconds' % (iNumWakeUpAttempts, machineName, iDelay))
                    time.sleep(iDelay)
                else:
                    logWarning('wake up attempt %d of %s failed too many times... giving up' % (iNumWakeUpAttempts, machineName))
                    return False  # couldn't wake up the machine for whatever reason

        bWakeUpFailed = False
        # wait until the machine is operational
        wakeUpToAliveDuration = 0
        while not Lib.SimpaDbUtil.isMachineResponding(machineName):
            time.sleep(5)
            wakeUpToAliveDuration += 5
            if wakeUpToAliveDuration > WAKEUPTIMEOUT:
                # the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
                logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, WAKEUPTIMEOUT))
                bWakeUpFailed = True
                break

        if bWakeUpFailed:
            numAttempts += 1
            if numAttempts >= 2:
                logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
                return False  # power state changed manually ?
            logWarning('attempting to wake up %s one more time' % (machineName))
        else:
            # wake up completed
            logInfo('Waking up of machine %s completed successfully' % machineName)
            return True


def onException(exception):
    """
        Reports a fatal exception (error log, standard output and e-mail) and
        then kills the current process.

        Reads the traceback of the exception currently being handled, so it is
        meant to be called from inside an except block.

        @param exception the exception that caused the stop
    """
    sys.stdout.flush()
    strExceptionType = type(exception)
    strMessage = 'exception %s : %s\n' % (strExceptionType, exception.message)
    strMessage += traceback.format_exc()

    logError(strMessage)
    print(strMessage)
    # NOTE(review): the sender string looks truncated (an address between
    # angle brackets may have been lost) -- confirm against the original file
    Lib.Util.sendTextMail('ClusterController ', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
    executeCommand('kill -9 %d' % os.getpid())  # stop other threads immediately
    exit()