# Bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give different errors, and these errors are not always timeouts). Therefore we try and try again until the command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail. The side effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this hack, cluster controller expects all machines to be plugged in.
# the following warning has been commented out because it pollutes the logs and apparently
# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
# no power on event is logged ...
#logWarning('degenerate ipmitool output for machine %s (see bug 00000002). Assuming power in on because that''s what I noticed when I had the case.' % machineName)
returnPowerState.ON
else:
assert(matchObj)
strAcpiState=matchObj.group('AcpiState')
ifstrAcpiState=='S0/G0':
ePowerState=PowerState.ON
elifstrAcpiState=='S3':# memory is still powered
ePowerState=PowerState.SLEEP
elifstrAcpiState=='S5/G2':# soft-off
ePowerState=PowerState.OFF
else:
printstrAcpiState
assert(False)
bPowerStateRead=True
else:
# error ... it's either because the machine is unplugged or because the machine is busy (well, I'm not sure what happened, but I had the case where the command failed for no apparent reason, and therefore I suspect the machine was busy). In order to differentiate these 2 cases, we try again, and if this command fails too many times then we decide it's unplugged (very dodgy, I know, but I'm disappointed that this command doesn't always work, and for now I don't know of other ways to differentiate between these cases...)
iMAX_NUM_ATTEMPTS=5
iNumFailedAttempts+=1
ifiNumFailedAttempts<iMAX_NUM_ATTEMPTS:
logWarning('failed to read the power state of %s. I\'ll try a again a bit later....'%machineName)
time.sleep(5)
else:
logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged'%machineName)
ePowerState=PowerState.UNPLUGGED# too many attempts failed ... I guess it's because the machine is unplugged
logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. '%(machineName))
# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.'%(machineName,WAKEUPTIMEOUT))
bWakeUpFailed=True
break
ifbWakeUpFailed:
numAttempts+=1
ifnumAttempts>=2:
logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)'%(machineName))
returnFalse# power state changed manually ?
else:
logWarning('attempting to wake up %s one more time'%(machineName))
else:
# wake up completed
logInfo('Waking up of machine %s completed successfully'%machineName)
# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
# by a failure of sendTextMail... that's why I've embedded the sendmail inside a try/except block, so that if this operation fails, then the
# kill of the main process is still executed.
Lib.Util.sendTextMail('ClusterController <guillaume.raffy@univ-rennes1.fr>','guillaume.raffy@univ-rennes1.fr','ClusterController has stopped because of an exception',strMessage)
exceptBaseException:
logError("Could not send the email to notify the administrator that cluster controller failed")