You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by dd...@apache.org on 2008/06/06 12:57:42 UTC
svn commit: r663889 - in /hadoop/core/trunk/src/contrib/hod: CHANGES.txt
bin/hod bin/ringmaster hodlib/GridServices/service.py hodlib/Hod/hadoop.py
hodlib/Hod/hod.py hodlib/RingMaster/ringMaster.py
Author: ddas
Date: Fri Jun 6 03:57:42 2008
New Revision: 663889
URL: http://svn.apache.org/viewvc?rev=663889&view=rev
Log:
HADOOP-3184. Modified HOD to handle master failures on bad nodes by trying to bring them up on another node in the ring. Contributed by Hemanth Yamijala.
Modified:
hadoop/core/trunk/src/contrib/hod/CHANGES.txt
hadoop/core/trunk/src/contrib/hod/bin/hod
hadoop/core/trunk/src/contrib/hod/bin/ringmaster
hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py
hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py
hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py
Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/CHANGES.txt?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/trunk/src/contrib/hod/CHANGES.txt Fri Jun 6 03:57:42 2008
@@ -9,6 +9,10 @@
exist and to auto-deallocate a cluster while reallocating it, if it is
already dead. (Hemanth Yamijala via mukund)
+ HADOOP-3184. Modified HOD to handle master failures on bad nodes by trying
+ to bring them up on another node in the ring. (Hemanth Yamijala via ddas)
+
+
NEW FEATURES
IMPROVEMENTS
Modified: hadoop/core/trunk/src/contrib/hod/bin/hod
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/bin/hod?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/bin/hod (original)
+++ hadoop/core/trunk/src/contrib/hod/bin/hod Fri Jun 6 03:57:42 2008
@@ -225,7 +225,12 @@
False, 120, False, True),
('idleness-limit', 'pos_int', 'Limit after which to deallocate the cluster',
- False, 3600, False, True)),
+ False, 3600, False, True),
+
+ ('max-master-failures', 'pos_int',
+ 'Defines how many times a master can fail before' \
+ ' failing cluster allocation', False, 5, True, True)),
+
'gridservice-mapred' : (
('external', 'bool', "Connect to an already running MapRed?",
Modified: hadoop/core/trunk/src/contrib/hod/bin/ringmaster
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/bin/ringmaster?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/bin/ringmaster (original)
+++ hadoop/core/trunk/src/contrib/hod/bin/ringmaster Fri Jun 6 03:57:42 2008
@@ -113,7 +113,11 @@
False, 120, False, True),
('idleness-limit', 'pos_int', 'Limit after which to deallocate the cluster',
- False, 3600, False, True)),
+ False, 3600, False, True),
+
+ ('max-master-failures', 'pos_int',
+ 'Defines how many times a master can fail before' \
+ ' failing cluster allocation', False, 5, True, True)),
'resource_manager' : (
('id', 'string', 'Batch scheduler ID: torque|condor.',
@@ -330,7 +334,8 @@
'ringmaster', 'svcrgy-addr'))
serviceClient = hodXRClient(serviceAddr)
if serviceClient is not None:
- serviceClient.setRMError([str(e),get_exception_string()])
+ serviceClient.setRMError([local_fqdn(), str(e), \
+ get_exception_string()])
log.info("Reported errors to service registry at %s" % serviceAddr)
except Exception, e:
log.error("Failed to report errors to service registry.")
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py Fri Jun 6 03:57:42 2008
@@ -66,6 +66,7 @@
self.masterAddress = 'none'
self.requiredNode = requiredNode
self.failedMsg = None
+ self.masterFailureCount = 0
def getRequiredNode(self):
return self.requiredNode
@@ -136,6 +137,9 @@
""" set the master initialized to
true. """
self.masterInitialized = True
+ # Reset failure related variables, as master is initialized successfully.
+ self.masterFailureCount = 0
+ self.failedMsg = None
def getMasterAddress(self):
""" it needs to change to reflect
@@ -152,11 +156,19 @@
return self.serviceDesc.isExternal()
def setMasterFailed(self, err):
+ """Sets variables related to Master failure"""
+ self.masterFailureCount += 1
self.failedMsg = err
+ # When command is sent to HodRings, this would have been set to True.
+ # Reset it to reflect the correct status.
+ self.launchedMaster = False
def getMasterFailed(self):
return self.failedMsg
-
+
+ def getMasterFailureCount(self):
+ return self.masterFailureCount
+
class NodeRequest:
""" A class to define
a node request. """
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py Fri Jun 6 03:57:42 2008
@@ -604,9 +604,11 @@
if status == 5 or status == 6:
ringMasterErrors = self.__svcrgyClient.getRMError()
if ringMasterErrors:
- self.__log.critical("Cluster could not be allocated because of the following errors on the ringmaster host.\n%s" % \
- (ringMasterErrors[0]))
- self.__log.debug("Stack trace on ringmaster: %s" % ringMasterErrors[1])
+ self.__log.critical("Cluster could not be allocated because" \
+ " of the following errors on the "\
+ "ringmaster host %s.\n%s" % \
+ (ringMasterErrors[0], ringMasterErrors[1]))
+ self.__log.debug("Stack trace on ringmaster: %s" % ringMasterErrors[2])
return status
def __isRingMasterAlive(self, rmAddr):
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py Fri Jun 6 03:57:42 2008
@@ -227,6 +227,31 @@
return opList
+ def __adjustMasterFailureCountConfig(self, nodeCount):
+ # This method adjusts the ringmaster.max-master-failures variable
+ # to a value that is bounded by a function of the number of
+ # nodes.
+
+ maxFailures = self.__cfg['ringmaster']['max-master-failures']
+ # Count number of masters required - depends on which services
+ # are external
+ masters = 0
+ if not self.__cfg['gridservice-hdfs']['external']:
+ masters += 1
+ if not self.__cfg['gridservice-mapred']['external']:
+ masters += 1
+
+ # So, if there are n nodes and m masters, we look at least for
+ # all masters to come up. Therefore, at least m nodes should be
+ # good, which means a maximum of n-m master nodes can fail.
+ maxFailedNodes = nodeCount - masters
+
+ # The configured max number of failures is now bounded by this
+ # number.
+ self.__cfg['ringmaster']['max-master-failures'] = \
+ min(maxFailures, maxFailedNodes)
+
+
def _op_allocate(self, args):
operation = "allocate"
argLength = len(args)
@@ -312,6 +337,9 @@
self.__cleanup()
raise HodInterruptException()
self.__log.debug("Service Registry started.")
+
+ self.__adjustMasterFailureCountConfig(nodes)
+
try:
allocateStatus = self.__cluster.allocate(clusterDir, min, max)
except HodInterruptException, h:
Modified: hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py Fri Jun 6 03:57:42 2008
@@ -370,7 +370,12 @@
for v in self.serviceDict.itervalues():
if (not v.isExternal()):
if v.isLaunchable(self.serviceDict):
- if not v.isMasterLaunched():
+ # If a master is still not launched, and the number of
+ # master failures has not exceeded the configured maximum,
+ # launch master
+ if not v.isMasterLaunched() and \
+ (v.getMasterFailureCount() <= \
+ self.cfg['ringmaster']['max-master-failures']):
cmdList = v.getMasterCommands(self.serviceDict)
v.setlaunchedMaster()
v.setMasterAddress(addr)
@@ -441,7 +446,8 @@
def setHodRingErrors(self, addr, errors):
"""This method is called by the hodrings to update errors
it encountered while starting up"""
- self.log.critical("Hodring at %s failed with following errors:\n%s" % (addr, errors))
+ self.log.critical("Hodring at %s failed with following errors:\n%s" \
+ % (addr, errors))
lock = self.masterParamLock
lock.acquire()
try:
@@ -452,7 +458,8 @@
idx = addr.rfind('_')
if idx is not -1:
addr = addr[:idx]
- v.setMasterFailed("Hodring at %s failed with following errors:\n%s" % (addr, errors))
+ v.setMasterFailed("Hodring at %s failed with following" \
+ " errors:\n%s" % (addr, errors))
except:
self.log.debug(get_exception_string())
pass
@@ -478,8 +485,16 @@
pass
else:
self.log.debug("getServiceAddr service: %s" % service)
+ # Check if we should give up ! If the limit on max failures is hit,
+ # give up.
err = service.getMasterFailed()
- if err is not None:
+ if (err is not None) and \
+ (service.getMasterFailureCount() > \
+ self.cfg['ringmaster']['max-master-failures']):
+ self.log.critical("Detected errors (%s) beyond allowed number"\
+ " of failures (%s). Flagging error to client" \
+ % (service.getMasterFailureCount(), \
+ self.cfg['ringmaster']['max-master-failures']))
addr = "Error: " + err
elif (service.isMasterInitialized()):
addr = service.getMasterAddrs()[0]