You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by dd...@apache.org on 2008/06/12 12:32:27 UTC

svn commit: r667035 - in /hadoop/core/branches/branch-0.18/src/contrib/hod: CHANGES.txt hodlib/Hod/hadoop.py hodlib/Hod/hod.py hodlib/NodePools/torque.py hodlib/Schedulers/torque.py testing/testHod.py testing/testRingmasterRPCs.py

Author: ddas
Date: Thu Jun 12 03:32:26 2008
New Revision: 667035

URL: http://svn.apache.org/viewvc?rev=667035&view=rev
Log:
Merge -r 667032:667033 from trunk onto 0.18 branch. Fixes HADOOP-3523.

Modified:
    hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt
    hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hadoop.py
    hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hod.py
    hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/NodePools/torque.py
    hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Schedulers/torque.py
    hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testHod.py
    hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testRingmasterRPCs.py

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt?rev=667035&r1=667034&r2=667035&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/CHANGES.txt Thu Jun 12 03:32:26 2008
@@ -1,7 +1,6 @@
 HOD Change Log
 
-
-Trunk (unreleased changes)
+Release 0.18.0 - Unreleased
 
   INCOMPATIBLE CHANGES
 
@@ -29,10 +28,13 @@
  
   BUG FIXES
 
-    HADOOP-2961: Avoids unnecessary checks for some configuration parameters
+    HADOOP-2961. Avoids unnecessary checks for some configuration parameters
     related to service configuration. (Vinod Kumar Vavilapalli via ddas)
 
-Release 0.17.0 - Unreleased
+    HADOOP-3523. Fixes auto-deallocation of cluster if job id is not found in
+    Torque's job list. (Hemanth Yamijala via ddas)
+
+Release 0.17.0 - 2008-05-18
 
   INCOMPATIBLE CHANGES
 

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hadoop.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hadoop.py?rev=667035&r1=667034&r2=667035&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hadoop.py (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hadoop.py Thu Jun 12 03:32:26 2008
@@ -431,7 +431,9 @@
     """Returns True if the JobId that represents this cluster
        is in the Completed or exiting state."""
     jobInfo = self.__nodePool.getJobInfo(jobId)
-    state = jobInfo['job_state']
+    state = None
+    if jobInfo is not None and jobInfo.has_key('job_state'):
+      state = jobInfo['job_state']
     return ((state == 'C') or (state == 'E'))
 
   def cleanup(self):

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hod.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hod.py?rev=667035&r1=667034&r2=667035&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hod.py (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Hod/hod.py Thu Jun 12 03:32:26 2008
@@ -307,7 +307,7 @@
           self.__remove_cluster(clusterDir)
           self.__clusterState.clear()
         else:
-          self.__log.critical("Found a previously allocated cluster at cluster directory '%s'. Deallocate the cluster first." % (clusterDir))
+          self.__log.critical("Found a previously allocated cluster at cluster directory '%s'. HOD cannot determine if this cluster can be automatically deallocated. Deallocate the cluster if it is unused." % (clusterDir))
           self.__opCode = 12
           return
  

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/NodePools/torque.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/NodePools/torque.py?rev=667035&r1=667034&r2=667035&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/NodePools/torque.py (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/NodePools/torque.py Thu Jun 12 03:32:26 2008
@@ -270,7 +270,8 @@
 
   def getJobInfo(self, jobId=None):
     #torque error code when credentials fail, a temporary condition sometimes.
-    credFailureErrorCode = 171 
+    credFailureErrorCode = 171
+    jobNonExistentErrorCode = 153
     credFailureRetries = 10
     i = 0
     self.__jobInfo = None
@@ -283,6 +284,12 @@
       if exitCode == 0:
         self.__jobInfo = qstatInfo
         break
+      elif exitCode == jobNonExistentErrorCode:
+        # A non-existent job id really means that the job completed.
+        # However, we set only job_state for now, not any other
+        # attributes, as none seem to be required.
+        self.__jobInfo = { 'job_state' : 'C' }
+        break
       else:
         if exitCode == credFailureErrorCode:
           time.sleep(1)

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Schedulers/torque.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Schedulers/torque.py?rev=667035&r1=667034&r2=667035&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Schedulers/torque.py (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/hodlib/Schedulers/torque.py Thu Jun 12 03:32:26 2008
@@ -93,7 +93,7 @@
     
     exitCode = qstatProcess.exit_code()
     if exitCode > 0:
-      self.__log.error('qstat error: %s' % qstatProcess.exit_status_string())
+      self.__log.warn('qstat error: %s' % qstatProcess.exit_status_string())
     else:
       qstatInfo = {}
       for line in qstatProcess.output():

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testHod.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testHod.py?rev=667035&r1=667034&r2=667035&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testHod.py (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testHod.py Thu Jun 12 03:32:26 2008
@@ -185,7 +185,10 @@
     userState = { clusterDir : jobid }
     self.__setupClusterState(userState, False)
     self.client._op_allocate(['allocate', clusterDir, '3'])
-    self.assertTrue(self.log.hasMessage("Found a previously allocated cluster at cluster directory '%s'. Deallocate the cluster first." % (clusterDir), 'critical'))
+    self.assertTrue(self.log.hasMessage("Found a previously allocated cluster at "\
+                      "cluster directory '%s'. HOD cannot determine if this cluster "\
+                      "can be automatically deallocated. Deallocate the cluster if it "\
+                      "is unused." % (clusterDir), 'critical'))
     os.rmdir(clusterDir)
 
   def __setupClusterState(self, clusterStateMap, verifyDirIsAbsent=True):

Modified: hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testRingmasterRPCs.py
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testRingmasterRPCs.py?rev=667035&r1=667034&r2=667035&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testRingmasterRPCs.py (original)
+++ hadoop/core/branches/branch-0.18/src/contrib/hod/testing/testRingmasterRPCs.py Thu Jun 12 03:32:26 2008
@@ -68,7 +68,8 @@
                             'batch-home': '/home/y/'
                           }, 
        'ringmaster': {
-                      'max-connect' : 2
+                      'max-connect' : 2,
+                      'max-master-failures' : 5
                      }, 
        'hodring': {
                   },