You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ni...@apache.org on 2008/03/03 17:44:46 UTC

svn commit: r633166 - in /hadoop/core/trunk: CHANGES.txt src/contrib/hod/hodlib/RingMaster/idleJobTracker.py

Author: nigel
Date: Mon Mar  3 08:44:44 2008
New Revision: 633166

URL: http://svn.apache.org/viewvc?rev=633166&view=rev
Log:
HADOOP-2847.  Ensure idle cluster cleanup works even if the JobTracker becomes unresponsive to RPC calls. Contributed by Hemanth Yamijala.

Modified:
    hadoop/core/trunk/CHANGES.txt
    hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/idleJobTracker.py

Modified: hadoop/core/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/CHANGES.txt?rev=633166&r1=633165&r2=633166&view=diff
==============================================================================
--- hadoop/core/trunk/CHANGES.txt (original)
+++ hadoop/core/trunk/CHANGES.txt Mon Mar  3 08:44:44 2008
@@ -216,6 +216,9 @@
     HADOOP-2923.  Add SequenceFileAsBinaryInputFormat, which was
     missed in the commit for HADOOP-2603. (cdouglas via omalley)
 
+    HADOOP-2847.  Ensure idle cluster cleanup works even if the JobTracker
+    becomes unresponsive to RPC calls. (Hemanth Yamijala via nigel)
+
 Release 0.16.0 - 2008-02-07
 
   INCOMPATIBLE CHANGES

Modified: hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/idleJobTracker.py
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/idleJobTracker.py?rev=633166&r1=633165&r2=633166&view=diff
==============================================================================
--- hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/idleJobTracker.py (original)
+++ hadoop/core/trunk/src/contrib/hod/hodlib/RingMaster/idleJobTracker.py Mon Mar  3 08:44:44 2008
@@ -31,6 +31,13 @@
   def getStatus(self):
     return self.__status
 
+class HadoopClientException(Exception):
+  """This class represents an exception that is raised when we fail in
+     running the job client."""
+  
+  def __init__(self, errorCode):
+    self.errorCode = errorCode
+  
 class JobTrackerMonitor:
   """This class monitors the JobTracker of an allocated cluster
      periodically to detect whether it is idle. If it is found
@@ -134,7 +141,17 @@
 
   def __isIdle(self):
     """This method checks if the JobTracker is idle beyond a certain limit."""
-    if self.__getJobCount() == 0:
+    jobCount = 0
+    err = False
+
+    try:
+      jobCount = self.__getJobCount()
+    except HadoopClientException, hce:
+      self.__log.debug('HadoopClientException handled in getting job count. \
+                                      Error code: %s' % hce.errorCode)
+      err = True
+
+    if (jobCount==0) or err:
       if self.__firstIdleTime == 0:
         #detecting idleness for the first time
         self.__firstIdleTime = time.time()
@@ -145,6 +162,7 @@
     else:
       # reset idleness time
       self.__firstIdleTime = 0
+      
     return False
 
   def __getJobCount(self):
@@ -164,6 +182,11 @@
         match = self.__jobCountRegExp.match(line)
         if match:
           jobs = int(match.group(1))
+    elif jtStatusCommand.exit_code() == 1:
+      # For now, JobClient exits with code 1 for any exception it raises. If Hadoop later
+      # returns more granular exit codes, we can check specifically for the codes that
+      # correspond to network errors etc.
+      raise HadoopClientException(jtStatusCommand.exit_code())
     return jobs
 
   def __isCompatibleHadoopVersion(self, expectedVersion):