You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by om...@apache.org on 2011/03/04 05:36:41 UTC

svn commit: r1077615 - in /hadoop/common/branches/branch-0.20-security-patches/src: mapred/org/apache/hadoop/mapred/JobTracker.java test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java

Author: omalley
Date: Fri Mar  4 04:36:41 2011
New Revision: 1077615

URL: http://svn.apache.org/viewvc?rev=1077615&view=rev
Log:
commit f1513dc4f84d112221ea5fd496513d904b7bd20f
Author: Chris Douglas <cd...@apache.org>
Date:   Wed Jul 28 18:29:39 2010 -0700

     Fix failing TrackerBlacklistAcrossJobs and maintain separation
    between graylist (by heuristic) and blacklist (by health script) from

Modified:
    hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java
    hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java

Modified: hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java?rev=1077615&r1=1077614&r2=1077615&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java Fri Mar  4 04:36:41 2011
@@ -693,7 +693,6 @@ public class JobTracker implements MRCon
 
     private int numFaultBuckets;
     private long bucketWidth;
-    private boolean isHealthy;
     private HashMap<ReasonForBlackListing, String> blackRfbMap;
     private HashMap<ReasonForBlackListing, String> grayRfbMap;
 
@@ -704,7 +703,6 @@ public class JobTracker implements MRCon
       lastRotated = (time / bucketWidth) * bucketWidth;
       blacklisted = false;
       graylisted = false;
-      isHealthy = true;
       blackRfbMap = new HashMap<ReasonForBlackListing, String>();
       grayRfbMap = new HashMap<ReasonForBlackListing, String>();
     }
@@ -767,14 +765,6 @@ public class JobTracker implements MRCon
       }
     }
 
-    public void setHealth(boolean isHealthy) {
-      this.isHealthy = isHealthy;
-    }
-
-    public boolean isHealthy() {
-      return isHealthy;
-    }
-
     public String getTrackerBlackOrGraylistReport(boolean gray) {
       StringBuffer sb = new StringBuffer();
       HashMap<ReasonForBlackListing, String> rfbMap =
@@ -845,8 +835,7 @@ public class JobTracker implements MRCon
         long now = clock.getTime();
         FaultInfo fi = getFaultInfo(hostName, true);
         fi.incrFaultCount(now);
-        // check heuristics, and add either to blacklist (original behavior)
-        // or to graylist (new, optional behavior):
+        // check heuristics, and add to the graylist if over the limit:
         if (exceedsFaults(fi, now)) {
           LOG.info("Adding " + hostName + " to the graylist across all jobs");
           String reason = String.format(FaultInfo.FAULT_FORMAT_STRING,
@@ -877,7 +866,7 @@ public class JobTracker implements MRCon
     private boolean exceedsFaults(FaultInfo fi, long timeStamp) {
       int faultCount = fi.getFaultCount(timeStamp);
       if (faultCount >= TRACKER_FAULT_THRESHOLD) {
-        // calculate avgBlackLists
+        // calculate average faults across all nodes
         long clusterSize = getClusterStatus().getTaskTrackers();
         long sum = 0;
         for (FaultInfo f : potentiallyFaultyTrackers.values()) {
@@ -961,8 +950,8 @@ public class JobTracker implements MRCon
      * newest ("current") bucket.  Thus TRACKER_FAULT_TIMEOUT_WINDOW
      * determines the timeout value for TaskTracker faults (in combination
      * with TRACKER_FAULT_BUCKET_WIDTH), and the sum over all buckets is
-     * compared with TRACKER_FAULT_THRESHOLD to determine whether global
-     * blacklisting is warranted (or, alternatively, if it should be lifted).
+     * compared with TRACKER_FAULT_THRESHOLD to determine whether graylisting
+     * is warranted (or, alternatively, if it should be lifted).
      *
      * Assumes JobTracker is locked on entry.
      * 
@@ -1038,13 +1027,16 @@ public class JobTracker implements MRCon
       synchronized (potentiallyFaultyTrackers) {
         FaultInfo fi = potentiallyFaultyTrackers.remove(hostName);
         if (fi != null) {
+          // a tracker can be both blacklisted and graylisted, so check both
           if (fi.isGraylisted()) {
             LOG.info("Removing " + hostName + " from graylist");
             decrGraylistedTrackers(getNumTaskTrackersOnHost(hostName));
-          } else if (fi.isBlacklisted()) {
+          }
+          if (fi.isBlacklisted()) {
             LOG.info("Removing " + hostName + " from blacklist");
             addHostCapacity(hostName);
           }
+          // no need for fi.unBlacklist() for either one:  fi is already gone
         }
       }
     }
@@ -1067,7 +1059,8 @@ public class JobTracker implements MRCon
       }
     }
     
-    // This is called on tracker's restart or after a day of blacklist.
+    // This is called when a tracker is restarted or when the health-check
+    // script reports it healthy.  (Not called for graylisting.)
     private void addHostCapacity(String hostName) {
       synchronized (taskTrackers) {
         int numTrackersOnHost = 0;
@@ -1144,7 +1137,6 @@ public class JobTracker implements MRCon
       // MAPREDUCE-211 for details.  We never use graylisting for this path.)
       if (!isHealthy) {
         fi = getFaultInfo(hostName, true);
-        fi.setHealth(isHealthy);
         synchronized (potentiallyFaultyTrackers) {
           blacklistTracker(hostName, reason,
                            ReasonForBlackListing.NODE_UNHEALTHY, false);
@@ -2076,7 +2068,7 @@ public class JobTracker implements MRCon
   // Number of resolved entries
   int numResolved;
 
-  // statistics about TaskTrackers with faults; may lead to blacklisting
+  // statistics about TaskTrackers with faults; may lead to graylisting
   private FaultyTrackersInfo faultyTrackers = new FaultyTrackersInfo();
   
   private JobTrackerStatistics statistics = 
@@ -2216,8 +2208,8 @@ public class JobTracker implements MRCon
              conf.getInt("mapred.job.tracker.retiredjobs.cache.size", 1000);
     MAX_COMPLETE_USER_JOBS_IN_MEMORY = conf.getInt("mapred.jobtracker.completeuserjobs.maximum", 100);
 
-    // values related to heuristic blacklisting ("fault" == a per-job
-    // blacklisting; too many faults => node blacklisted across all jobs):
+    // values related to heuristic graylisting (a "fault" is a per-job
+    // blacklisting; too many faults => node is graylisted across all jobs):
     TRACKER_FAULT_TIMEOUT_WINDOW =  // 3 hours
       conf.getInt("mapred.jobtracker.blacklist.fault-timeout-window", 3 * 60);
     TRACKER_FAULT_BUCKET_WIDTH =    // 15 minutes
@@ -2865,7 +2857,7 @@ public class JobTracker implements MRCon
     addJobForCleanup(id);
 
     // add the (single-job) blacklisted trackers to potentially faulty list
-    // for possible heuristic blacklisting across all jobs
+    // for possible heuristic graylisting across all jobs
     if (job.getStatus().getRunState() == JobStatus.SUCCEEDED) {
       if (job.getNoOfBlackListedTrackers() > 0) {
         for (String hostName : job.getBlackListedTrackers()) {
@@ -3003,13 +2995,14 @@ public class JobTracker implements MRCon
    * cluster. The first element in the returned list contains the list of
    * active tracker names; the second element in the returned list contains
    * the list of blacklisted tracker names; and the third contains the list
-   * of graylisted tracker names.  Note that the blacklist is disjoint from
-   * the active list, but the graylist is not:  graylisted trackers are still
-   * active and therefore may appear in both lists.  (Graylisted trackers
-   * could conceivably appear on the blacklist rather than the active list;
-   * blacklisting comes about via the health-check script, while graylisting
-   * is heuristically based on the number of per-job blacklistings in a
-   * specified time interval.)
+   * of graylisted tracker names.  Note that the blacklist is disjoint
+   * from the active list, but the graylist is not:  initially, graylisted
+   * trackers are still active and therefore will appear in both lists.
+   * (Graylisted trackers can later be blacklisted, in which case they'll
+   * be removed from the active list and added to the blacklist, but they
+   * remain on the graylist in this case.  Blacklisting comes about via the
+   * health-check script, while graylisting is heuristically based on the
+   * number of per-job blacklistings in a specified time interval.)
    */
   // This method is synchronized to make sure that the locking order
   // "taskTrackers lock followed by faultyTrackers.potentiallyFaultyTrackers
@@ -4875,8 +4868,8 @@ public class JobTracker implements MRCon
     String trackerName = tracker.getTrackerName();
     // Remove completely after marking the tasks as 'KILLED'
     lostTaskTracker(tracker);
-    // tracker is lost, and if it is blacklisted, remove
-    // it from the count of blacklisted trackers in the cluster
+    // tracker is lost; if it is blacklisted and/or graylisted, remove
+    // it from the relevant count(s) of trackers in the cluster
     if (isBlacklisted(trackerName)) {
      faultyTrackers.decrBlacklistedTrackers(1);
     }

Modified: hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java?rev=1077615&r1=1077614&r2=1077615&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java (original)
+++ hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java Fri Mar  4 04:36:41 2011
@@ -61,7 +61,7 @@ public class TestTrackerBlacklistAcrossJ
     MiniMRCluster mr = null;
     FileSystem fileSys = null;
     Configuration conf = new Configuration();
-    // setup dfs and input
+    // set up dfs and input
     dfs = new MiniDFSCluster(conf, 1, true, null, hosts);
     fileSys = dfs.getFileSystem();
     if (!fileSys.mkdirs(inDir)) {
@@ -75,7 +75,7 @@ public class TestTrackerBlacklistAcrossJ
     mr = new MiniMRCluster(3, fileSys.getUri().toString(),
                            1, null, hosts, jtConf);
 
-    // setup job configuration
+    // set up job configuration
     JobConf mrConf = mr.createJobConf();
     JobConf job = new JobConf(mrConf);
     job.setInt("mapred.max.tracker.failures", 1);
@@ -93,16 +93,24 @@ public class TestTrackerBlacklistAcrossJ
     JobClient jc = new JobClient(mrConf);
     RunningJob running = JobClient.runJob(job);
     assertEquals("Job failed", JobStatus.SUCCEEDED, running.getJobState());
-    assertEquals("Didn't blacklist the host", 1, 
+    // heuristic blacklisting is graylisting as of 0.20.Fred
+    assertEquals("Blacklisted the host", 0,
       jc.getClusterStatus().getBlacklistedTrackers());
+    assertEquals("Didn't graylist the host", 1,
+      jc.getClusterStatus().getGraylistedTrackers());
     assertEquals("Fault count should be 1", 1, mr.getFaultCount(hosts[0]));
 
     // run the same job once again 
-    // there should be no change in blacklist count
+    // there should be no change in blacklist or graylist count, but fault
+    // count (per-job blacklistings) should go up by one
     running = JobClient.runJob(job);
     assertEquals("Job failed", JobStatus.SUCCEEDED, running.getJobState());
-    assertEquals("Didn't blacklist the host", 1,
+    assertEquals("Blacklisted the host", 0,
       jc.getClusterStatus().getBlacklistedTrackers());
-    assertEquals("Fault count should be 1", 1, mr.getFaultCount(hosts[0]));
+    assertEquals("Didn't graylist the host", 1,
+      jc.getClusterStatus().getGraylistedTrackers());
+    // previously this asserted 1, but makes no sense:  each per-job
+    // blacklisting counts as a new fault, so 2 runs => 2 faults:
+    assertEquals("Fault count should be 2", 2, mr.getFaultCount(hosts[0]));
   }
 }