# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, these errors not always being timeouts). Therefore we try and try again, until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller is expecting all machines to be plugged.
# the following warning has been commented out because it pollutes the logs and apparently
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
# no power on event is logged ...
#logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
returnPowerState.ON
else:
assert(matchObj)
strAcpiState=matchObj.group('AcpiState')
ifstrAcpiState=='S0/G0':
ePowerState=PowerState.ON
elifstrAcpiState=='S3':# memory is still powered
ePowerState=PowerState.SLEEP
elifstrAcpiState=='S5/G2':# soft-off
ePowerState=PowerState.OFF
else:
printstrAcpiState
assert(False)
bPowerStateRead=True
else:
# error ... it's either because the machine is unplugged or because the machine is busy (well I'm not sure what happened but I had the case where the command failed for no apparent reason, and therefore I suspect it to be busy ). In order to differentiate these 2 cases, we try again and if this command fails too many times then we decide it's unplugged (very dodgy I know but I'm disappointed that this command doesn't always work, and for now I don't know other ways to differentiate between these cases....)
iMAX_NUM_ATTEMPTS=5
iNumFailedAttempts+=1
ifiNumFailedAttempts<iMAX_NUM_ATTEMPTS:
logWarning('failed to read the power state of %s. I\'ll try a again a bit later....'%machineName)
time.sleep(5)
else:
logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged'%machineName)
ePowerState=PowerState.UNPLUGGED# too many attempts failed ... I guess it's because the machine is unplugged
logInfo('machine %s is now sleeping (put to sleep succeeded)'%machineName)
returnTrue
else:
ifePowerState!=PowerState.ON:
logWarning('unexpectedly, powerState of %s is %s'%(machineName,PowerStateToStr(ePowerState)))
assert(ePowerState==PowerState.ON)
iAttempt+=1
ifiAttempt>iMaxNumAttempts:
ifbBUG_00000010_IS_STILL_ALIVE:
logWarning('the attempt to put %s to sleep failed to many times (probably because of bug 00000010 (machine is in a weird state : power on but no ssh possible) ?)... giving up. '%(machineName))
returnFalse
else:
logWarning('the attempt to put %s to sleep failed to many times... giving up'%(machineName))
returnFalse
else:
logWarning('the attempt to put %s to sleep failed... trying again'%(machineName))
returnTrue
# Wake up the machine named `machineName`, retrying when it does not come alive.
#
# Visible behavior: progress is reported through logInfo/logWarning, failed
# attempts are counted in numAttempts, and False is returned once two attempts
# have failed.
# NOTE(review): this excerpt has lost its indentation and has keywords fused to
# the following identifier; the statements that send the wake-up request and
# wait for the machine to respond are not visible here, and neither is the
# value returned on success. Confirm against the complete file before changing
# any logic.
defblockingWakeUpMachine(machineName):
logInfo('waking up machine %s...'%machineName)
# number of wake-up attempts that have failed so far
numAttempts=0
# starts True so that the retry loop below is entered at least once
bWakeUpFailed=True
whilebWakeUpFailed:# try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
# NOTE(review): WAKEUPTIMEOUT is not assigned in the visible lines; the message
# below treats it as a number of seconds -- confirm where it is set.
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.'%(machineName,WAKEUPTIMEOUT))
# record that this attempt did not bring the machine up
bWakeUpFailed=True
# NOTE(review): which loop this leaves cannot be determined from the
# de-indented excerpt (presumably an inner waiting loop) -- verify.
break
ifbWakeUpFailed:
numAttempts+=1
# give up after the second failed attempt
ifnumAttempts>=2:
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)'%(machineName))
returnFalse# power state changed manually ?
else:
# fewer than two failures so far: announce another attempt
logWarning('attempting to wake up %s one more time'%(machineName))
else:
# wake up completed
logInfo('Waking up of machine %s completed successfully'%machineName)
Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>','guillaume.raffy@univ-rennes1.fr','ClusterController has stopped because of an exception',strMessage)
executeCommand('kill -9 %d'%os.getpid())# stop other threads immediately