# cocluto/ClusterController/Util.py

import Lib.Util
import Lib.SimpaDbUtil
from Log import *
from PowerState import *

# the standard modules are imported after the wildcard imports so that they always
# take precedence over any same-named symbol exported by 'from ... import *'
import os
import re
import StringIO
import sys
import time
import traceback

def executeProgram( astrArguments ):
	"""
		runs a program through Lib.Util.executeProgram and traces the call in the debug log
		@param astrArguments	the program followed by its arguments, as a sequence of strings
		@return	the tuple (returnCode, stdout, stderr) obtained from Lib.Util.executeProgram
	"""
	# while this flag is set, every call is traced in the debug log, not only the failing ones
	bBUG_00000008_IS_STILL_ACTIVE = True
	strProgram = ','.join(astrArguments)
	if bBUG_00000008_IS_STILL_ACTIVE:
		logDebug('executeProgram : program = [%s]' % strProgram)
	(returnCode, stdout, stderr) = Lib.Util.executeProgram( astrArguments )
	bProgramFailed = (returnCode != 0)
	# the return code is logged exactly once : always while bug 00000008 is being tracked, only on failure otherwise
	if bBUG_00000008_IS_STILL_ACTIVE or bProgramFailed:
		logDebug('executeProgram : return code of [%s] = %d' % (strProgram, returnCode))
	if bProgramFailed:
		# for debugging purpose, log the outputs in case the program failed
		logDebug('executeProgram : stdout of [%s] = %s' % (strProgram, stdout))
		logDebug('executeProgram : stderr of [%s] = %s' % (strProgram, stderr))
	return (returnCode, stdout, stderr)

def executeCommand( command ):
	"""
		thin wrapper around Lib.Util.executeCommand
		@param command	the command, handed over untouched to Lib.Util.executeCommand
		@return	the tuple (returnCode, stdout, stderr) obtained from Lib.Util.executeCommand
	"""
	# the unpacking makes sure that exactly three values came back before they are handed to the caller
	(iExitCode, strStdout, strStderr) = Lib.Util.executeCommand( command )
	return (iExitCode, strStdout, strStderr)

def executeIpmiCommand( machineName, ipmiCommandArgs ):
	"""
		runs ipmitool against the light out management interface of the given machine
		@param machineName	the machine whose light out management ip address is looked up with Lib.SimpaDbUtil.getLightOutManagementIpAddress
		@param ipmiCommandArgs	the ipmitool arguments that are appended after the connection options (eg ['chassis', 'power', 'on'])
		@return	the tuple (returnCode, stdout, stderr) of the last ipmitool call
		@note	as long as the workaround for bug 00000005 is enabled, this function only returns once ipmitool has succeeded
	"""
	astrIpmiTool = ['ipmitool', '-U', 'admin', '-H', Lib.SimpaDbUtil.getLightOutManagementIpAddress( machineName ), '-f', '/usr/local/etc/LightOutManagementPassword.txt']
	astrIpmiTool += ipmiCommandArgs
	# bug 00000005 causes ipmitool to randomly fail for no apparent reason (two consecutive calls might give
	# different errors, these errors not always being timeouts). Therefore we try and try again, until the
	# command succeeds. If we don't do this, cluster controller keeps stopping because ipmi commands fail.
	# The effect of this hack is that the UNPLUGGED power state is no longer detected; therefore, with this
	# hack, cluster controller is expecting all machines to be plugged.
	#
	# examples of the errors that have been observed :
	#
	#	sh-3.2# ipmitool -U admin -H 129.20.27.220 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
	#	Unabled to establish a session with the BMC.
	#	Command failed due to insufficient resources for session (0xFFFEF901)
	#	-> this error means that the number of active connections to the BMC has reached the maximum (usually 5).
	#
	#	sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
	#	Unabled to establish a session with the BMC.
	#	Command failed due to Unknown (0xFFFEF923) (0xFFFEF923)
	#
	#	sh-3.2# ipmitool -U admin -H 129.20.27.212 -f /usr/local/etc/LightOutManagementPassword.txt sensor get 'ACPI State'
	#	Unabled to establish a session with the BMC.
	#	Command failed due to Timeout (0xFFFEF9C3)
	bBUG_00000005_IS_STILL_ACTIVE = True
	while True:
		(returnCode, stdout, stderr) = executeProgram( astrIpmiTool )
		# without the workaround, a single attempt is made, whatever its outcome
		if returnCode == 0 or not bBUG_00000005_IS_STILL_ACTIVE:
			break
		logWarning('the command "%s" failed. Retrying a bit later' % ' '.join(astrIpmiTool))
		time.sleep(5) # wait for 5 seconds before the next attempt, in order not to saturate activity

	return (returnCode, stdout, stderr)

def getPowerState( machineName ):
	"""
		reads the power state of a machine by querying its 'ACPI State' sensor through ipmi
		@param machineName	name of the machine to query
		@return	PowerState.ON, PowerState.SLEEP or PowerState.OFF according to the ACPI state found in the sensor output,
			PowerState.ON if no ACPI state could be found in the sensor output (see bug 00000002),
			PowerState.UNPLUGGED if the ipmi command failed iMAX_NUM_ATTEMPTS times
		@raise	AssertionError if the sensor reports an ACPI state that is not handled here
	"""
	iMAX_NUM_ATTEMPTS = 5
	iRETRY_DELAY = 5 # in seconds
	bBUG_00000002_IS_STILL_ACTIVE = True
	dictAcpiStateToPowerState = {
		'S0/G0' : PowerState.ON,
		'S3' : PowerState.SLEEP, # memory is still powered
		'S5/G2' : PowerState.OFF, # soft-off
	}
	iNumFailedAttempts = 0
	while True:
		(returnCode, stdout, stderr) = executeIpmiCommand( machineName, ['sensor', 'get', 'ACPI State'] )
		if returnCode == 0:
			matchObj = re.search(r'\[(?P<AcpiState>S[0-9][^\:]*)\:', stdout)
			if matchObj is None:
				if bBUG_00000002_IS_STILL_ACTIVE:
					# no warning is emitted here because it pollutes the logs and apparently
					# it's a 'feature' of all 4 core machines : if the machine is woken up using ipmitool, then
					# no power on event is logged ... Power is assumed to be on because that's what was
					# observed when the case happened.
					return PowerState.ON
				# raised explicitly (instead of using an assert statement) so that the check survives python -O
				raise AssertionError('degenerate ipmitool output for machine %s' % machineName)
			strAcpiState = matchObj.group('AcpiState')
			if strAcpiState not in dictAcpiStateToPowerState:
				print(strAcpiState)
				# raised explicitly (instead of using an assert statement) so that the check survives python -O
				raise AssertionError('unexpected ACPI state for machine %s : %s' % (machineName, strAcpiState))
			return dictAcpiStateToPowerState[strAcpiState]
		# error ... it's either because the machine is unplugged or because the machine is busy (the command
		# has been seen failing for no apparent reason, hence the suspicion that the machine was busy).
		# In order to differentiate these 2 cases, we try again and if this command fails too many times
		# then we decide it's unplugged (very dodgy, but no other way to differentiate these cases is known for now)
		iNumFailedAttempts += 1
		if iNumFailedAttempts >= iMAX_NUM_ATTEMPTS:
			logWarning('failed to read the power state of %s too many times. I assume this machine is unplugged' % machineName)
			return PowerState.UNPLUGGED # too many attempts failed ... I guess it's because the machine is unplugged
		logWarning('failed to read the power state of %s. I\'ll try a again a bit later....' % machineName)
		time.sleep(iRETRY_DELAY)

def wakeUpMachine( machineName ):
	"""
		sends a 'chassis power on' order to the machine through ipmi
		this way of waking up a machine seems more reliable than wake on lan (sometimes, sending a wake on lan packet seems to have no effect)
		@param machineName	name of the machine to wake up
		@return	True if the ipmi command returned 0, False otherwise
		@note	this function has been seen failing once for no obvious reason.. maybe the command does not succeed if the machine is in a transition state
	"""
	(iReturnCode, _strStdout, _strStderr) = executeIpmiCommand( machineName, ['chassis', 'power',  'on'] )
	# a non zero return code can happen if the machine is manually unplugged for example
	return (iReturnCode == 0)

def blockingPutMachineToSleep( machineName ):
	"""
		orders the machine to go to sleep (pmset over ssh) and waits until its power state is read as PowerState.SLEEP
		@param machineName	name of the machine to put to sleep
		@return	True on success, False if the machine is still not sleeping after iMaxNumAttempts sleep orders
		@raise	AssertionError if, while waiting, the machine is found in a power state that is neither ON nor SLEEP
	"""
	logInfo('putting machine %s to sleep...' % machineName)
	iMaxNumAttempts = 5
	iMaxGoToSleepDuration = 30 # in seconds
	iPollingPeriod = 5 # in seconds
	bBUG_239_IS_STILL_ALIVE = True
	# note : each sleep order is not actually succeeding (god knows why). Therefore, we need to try again and again.
	for iAttempt in range(1, iMaxNumAttempts + 1):
		# note : pmset must be executed as root
		# the outcome of the ssh command is deliberately ignored : success is decided from the power state read below
		executeProgram(['ssh', machineName, 'pmset sleepnow'])
		# check if the machine actually went to sleep
		iDelay = 0
		while iDelay < iMaxGoToSleepDuration:
			time.sleep(iPollingPeriod)
			iDelay += iPollingPeriod
			ePowerState = getPowerState( machineName )
			if ePowerState == PowerState.SLEEP:
				logInfo('machine %s is now sleeping (put to sleep succeeded)' % machineName)
				return True
			if ePowerState != PowerState.ON:
				logWarning('unexpectedly, powerState of %s is %s' % (machineName, PowerStateToStr(ePowerState)))
				# raised explicitly (instead of using an assert statement) so that the check survives python -O
				raise AssertionError('unexpected power state for machine %s while waiting for it to sleep' % machineName)
		if iAttempt < iMaxNumAttempts:
			logWarning('the attempt to put %s to sleep failed... trying again' % (machineName))
	# all the allowed attempts have been used
	if bBUG_239_IS_STILL_ALIVE:
		logWarning('the attempt to put %s to sleep failed too many times (probably because of bug 239 (machine is in a weird state : power on but no ssh possible) ?)... giving up. ' % (machineName))
	else:
		logWarning('the attempt to put %s to sleep failed too many times... giving up' % (machineName))
	return False

def blockingWakeUpMachine(machineName):
	"""
		wakes up a machine and waits until it responds
		each cycle first sends the wake up order with wakeUpMachine (up to 50 tries, 5 seconds apart), then polls
		Lib.SimpaDbUtil.isMachineResponding every 5 seconds until the machine responds or the wake up timeout is exceeded.
		The whole cycle is performed at most twice.
		@param machineName	name of the machine to wake up
		@return	True once the machine responds, False if the wake up order kept failing or if the machine never came alive
	"""
	logInfo('waking up machine %s...' % machineName)
	iMaxNumWakeUpAttempts = 50
	iRetryDelay = 5 # in seconds
	iWakeUpTimeout = 5*60 # max number of seconds allowed for a machine to be alive after a wakeup request
	iPollingPeriod = 5 # in seconds
	iNumFailedCycles = 0
	while True: # try more than once because sometimes for an unknown reason, the wake up order is ignored by the machine ... to be investigated
		# step 1 : get the wake up order accepted
		# the order can fail if the machine is already in a transition; in that case we try several times before giving up
		for iOrderIndex in range(1, iMaxNumWakeUpAttempts + 1):
			if wakeUpMachine( machineName ):
				break
			if iOrderIndex >= iMaxNumWakeUpAttempts:
				logWarning('wake up attempt %d of %s failed too many times... giving up' % (iOrderIndex, machineName))
				return False # couldn't wake up to machine for whatever reason
			logWarning('wake up attempt %d of %s failed... I\'ll try  again in %d seconds' % (iOrderIndex, machineName, iRetryDelay))
			time.sleep(iRetryDelay)

		# step 2 : wait until the machine is operational
		iElapsed = 0
		bTimedOut = False
		while not Lib.SimpaDbUtil.isMachineResponding( machineName ):
			time.sleep(iPollingPeriod)
			iElapsed += iPollingPeriod
			if iElapsed > iWakeUpTimeout:
				# the wake up failed for whatever reason (power state changed manually ? wake up order got lost ?)
				logWarning('%s took too long (more than %d seconds) to respond after a successful wakeup request.' % (machineName, iWakeUpTimeout))
				bTimedOut = True
				break

		if not bTimedOut:
			# wake up completed
			logInfo('Waking up of machine %s completed successfully' % machineName)
			return True

		iNumFailedCycles += 1
		if iNumFailedCycles >= 2:
			logWarning('giving up waking up %s because the wake up request succeeded but the machine never actually came alive (and this too many times)' % (machineName))
			return False # power state changed manually ?
		logWarning('attempting to wake up %s one more time' % (machineName))

def onException(exception):
	"""
		last resort handler : reports the exception (log, standard output, email to the administrator) then kills the whole process
		@param exception	the exception that caused the failure. The traceback that is reported is the one of the exception currently being handled
		@note	this function doesn't return : the process kills itself, and SystemExit is raised in case it is still alive afterwards
	"""
	try:
		sys.stdout.flush()
		# the 'message' attribute is not available on every exception in every python version :
		# when it's missing, the exception itself is formatted instead
		strMessage = 'exception %s : %s\n' % (type( exception ), getattr(exception, 'message', exception))
		strMessage += traceback.format_exc()
		logError(strMessage)
		print(strMessage)

		try:
			# I had the case (see bugzilla 234) where an assert failed in a child thread, but the main process kept going. I suspect that it was caused
			# by a failure of sendTextMail... that's why the sendmail is embedded inside a try except block, so that if this operation fails, then the
			# kill of the main process is still executed.
			Lib.Util.sendTextMail( 'ClusterController <guillaume.raffy@univ-rennes1.fr>', 'guillaume.raffy@univ-rennes1.fr', 'ClusterController has stopped because of an exception', strMessage)
		except BaseException:
			logError("Could not send the email to notify the administrator that cluster controller failed")
	finally:
		# the kill is performed in a finally clause so that it is executed even if the reporting above fails
		executeCommand('kill -9 %d' % os.getpid()) # stop other threads immediately
		sys.exit()