You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by dd...@apache.org on 2008/06/06 12:57:42 UTC

svn commit: r663889 - in /hadoop/core/trunk/src/contrib/hod: CHANGES.txt bin/hod bin/ringmaster hodlib/GridServices/service.py hodlib/Hod/hadoop.py hodlib/Hod/hod.py hodlib/RingMaster/ringMaster.py

Author: ddas
Date: Fri Jun  6 03:57:42 2008
New Revision: 663889

URL: http://svn.apache.org/viewvc?rev=663889&view=rev
Log:
HADOOP-3184. Modified HOD to handle master failures on bad nodes by trying to bring them up on another node in the ring. Contributed by Hemanth Yamijala.

Modified:
    hadoop/core/trunk/src/contrib/hod/CHANGES.txt
    hadoop/core/trunk/src/contrib/hod/bin/hod
    hadoop/core/trunk/src/contrib/hod/bin/ringmaster
    hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py
    hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
    hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py
    hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py

Modified: hadoop/core/trunk/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/CHANGES.txt?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/trunk/src/contrib/hod/CHANGES.txt Fri Jun  6 03:57:42 2008
@@ -9,6 +9,10 @@
     exist and to auto-deallocate a cluster while reallocating it, if it is
     already dead. (Hemanth Yamijala via mukund)
 
+    HADOOP-3184. Modified HOD to handle master failures on bad nodes by trying 
+    to bring them up on another node in the ring. (Hemanth Yamijala via ddas)
+
+
   NEW FEATURES
 
   IMPROVEMENTS

Modified: hadoop/core/trunk/src/contrib/hod/bin/hod
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/bin/hod?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/bin/hod (original)
+++ hadoop/core/trunk/src/contrib/hod/bin/hod Fri Jun  6 03:57:42 2008
@@ -225,7 +225,12 @@
              False, 120, False, True),
 
              ('idleness-limit', 'pos_int', 'Limit after which to deallocate the cluster',
-             False, 3600, False, True)),
+             False, 3600, False, True),
+
+             ('max-master-failures', 'pos_int', 
+              'Defines how many times a master can fail before' \
+              ' failing cluster allocation', False, 5, True, True)),
+
 
             'gridservice-mapred' : (
              ('external', 'bool', "Connect to an already running MapRed?",

Modified: hadoop/core/trunk/src/contrib/hod/bin/ringmaster
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/bin/ringmaster?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/bin/ringmaster (original)
+++ hadoop/core/trunk/src/contrib/hod/bin/ringmaster Fri Jun  6 03:57:42 2008
@@ -113,7 +113,11 @@
              False, 120, False, True),
 
              ('idleness-limit', 'pos_int', 'Limit after which to deallocate the cluster',
-             False, 3600, False, True)),
+             False, 3600, False, True),
+
+             ('max-master-failures', 'pos_int', 
+              'Defines how many times a master can fail before' \
+              ' failing cluster allocation', False, 5, True, True)),
 
             'resource_manager' : (
              ('id', 'string', 'Batch scheduler ID: torque|condor.',
@@ -330,7 +334,8 @@
                                      'ringmaster', 'svcrgy-addr'))
       serviceClient = hodXRClient(serviceAddr)
       if serviceClient is not None:
-        serviceClient.setRMError([str(e),get_exception_string()])
+        serviceClient.setRMError([local_fqdn(), str(e), \
+                                    get_exception_string()])
         log.info("Reported errors to service registry at %s" % serviceAddr)
     except Exception, e:
       log.error("Failed to report errors to service registry.")

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/GridServices/service.py Fri Jun  6 03:57:42 2008
@@ -66,6 +66,7 @@
     self.masterAddress = 'none'
     self.requiredNode = requiredNode
     self.failedMsg = None
+    self.masterFailureCount = 0
 
   def getRequiredNode(self):
     return self.requiredNode
@@ -136,6 +137,9 @@
     """ set the master initialized to
     true. """
     self.masterInitialized = True
+    # Reset failure related variables, as master is initialized successfully.
+    self.masterFailureCount = 0
+    self.failedMsg = None
 
   def getMasterAddress(self):
     """ it needs to change to reflect 
@@ -152,11 +156,19 @@
     return self.serviceDesc.isExternal()
 
   def setMasterFailed(self, err):
+    """Sets variables related to Master failure"""
+    self.masterFailureCount += 1
     self.failedMsg = err
+    # When command is sent to HodRings, this would have been set to True.
+    # Reset it to reflect the correct status.
+    self.launchedMaster = False
 
   def getMasterFailed(self):
     return self.failedMsg
-  
+ 
+  def getMasterFailureCount(self):
+    return self.masterFailureCount
+ 
 class NodeRequest:
   """ A class to define 
   a node request. """

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hadoop.py Fri Jun  6 03:57:42 2008
@@ -604,9 +604,11 @@
     if status == 5 or status == 6:
       ringMasterErrors = self.__svcrgyClient.getRMError()
       if ringMasterErrors:
-        self.__log.critical("Cluster could not be allocated because of the following errors on the ringmaster host.\n%s" % \
-                               (ringMasterErrors[0]))
-        self.__log.debug("Stack trace on ringmaster: %s" % ringMasterErrors[1])
+        self.__log.critical("Cluster could not be allocated because" \
+                            " of the following errors on the "\
+                            "ringmaster host %s.\n%s" % \
+                               (ringMasterErrors[0], ringMasterErrors[1]))
+        self.__log.debug("Stack trace on ringmaster: %s" % ringMasterErrors[2])
     return status
 
   def __isRingMasterAlive(self, rmAddr):

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/Hod/hod.py Fri Jun  6 03:57:42 2008
@@ -227,6 +227,31 @@
          
     return opList 
   
+  def __adjustMasterFailureCountConfig(self, nodeCount):
+    # This method adjusts the ringmaster.max-master-failures variable
+    # to a value that is bounded by a function of the number of
+    # nodes.
+
+    maxFailures = self.__cfg['ringmaster']['max-master-failures']
+    # Count number of masters required - depends on which services
+    # are external
+    masters = 0
+    if not self.__cfg['gridservice-hdfs']['external']:
+      masters += 1
+    if not self.__cfg['gridservice-mapred']['external']:
+      masters += 1
+
+    # So, if there are n nodes and m masters, we need at least
+    # all masters to come up. Therefore, at least m nodes should be
+    # good, which means a maximum of n-m master nodes can fail.
+    maxFailedNodes = nodeCount - masters
+
+    # The configured max number of failures is now bounded by this
+    # number.
+    self.__cfg['ringmaster']['max-master-failures'] = \
+                              min(maxFailures, maxFailedNodes)
+
+    
   def _op_allocate(self, args):
     operation = "allocate"
     argLength = len(args)
@@ -312,6 +337,9 @@
               self.__cleanup()
               raise HodInterruptException()
             self.__log.debug("Service Registry started.")
+
+            self.__adjustMasterFailureCountConfig(nodes)
+            
             try:
               allocateStatus = self.__cluster.allocate(clusterDir, min, max)    
             except HodInterruptException, h:

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py?rev=663889&r1=663888&r2=663889&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/ringMaster.py Fri Jun  6 03:57:42 2008
@@ -370,7 +370,12 @@
         for v in self.serviceDict.itervalues():
           if (not v.isExternal()):
             if v.isLaunchable(self.serviceDict):
-              if not v.isMasterLaunched():
+              # If a master is still not launched, and its failure
+              # count has not exceeded the configured maximum,
+              # launch master
+              if not v.isMasterLaunched() and \
+                  (v.getMasterFailureCount() <= \
+                      self.cfg['ringmaster']['max-master-failures']):
                 cmdList = v.getMasterCommands(self.serviceDict)
                 v.setlaunchedMaster()
                 v.setMasterAddress(addr)
@@ -441,7 +446,8 @@
   def setHodRingErrors(self, addr, errors):
     """This method is called by the hodrings to update errors 
       it encountered while starting up"""
-    self.log.critical("Hodring at %s failed with following errors:\n%s" % (addr, errors))
+    self.log.critical("Hodring at %s failed with following errors:\n%s" \
+                        % (addr, errors))
     lock = self.masterParamLock
     lock.acquire()
     try:
@@ -452,7 +458,8 @@
             idx = addr.rfind('_')
             if idx is not -1:
               addr = addr[:idx]
-            v.setMasterFailed("Hodring at %s failed with following errors:\n%s" % (addr, errors))
+            v.setMasterFailed("Hodring at %s failed with following" \
+                                " errors:\n%s" % (addr, errors))
     except:
       self.log.debug(get_exception_string())
       pass
@@ -478,8 +485,16 @@
       pass
     else:
       self.log.debug("getServiceAddr service: %s" % service)
+      # Check whether we should give up: if the limit on max failures
+      # has been exceeded, give up.
       err = service.getMasterFailed()
-      if err is not None:
+      if (err is not None) and \
+            (service.getMasterFailureCount() > \
+                      self.cfg['ringmaster']['max-master-failures']):
+        self.log.critical("Detected errors (%s) beyond allowed number"\
+                            " of failures (%s). Flagging error to client" \
+                            % (service.getMasterFailureCount(), \
+                              self.cfg['ringmaster']['max-master-failures']))
         addr = "Error: " + err
       elif (service.isMasterInitialized()):
         addr = service.getMasterAddrs()[0]