You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by om...@apache.org on 2011/03/04 05:36:41 UTC
svn commit: r1077615 - in
/hadoop/common/branches/branch-0.20-security-patches/src:
mapred/org/apache/hadoop/mapred/JobTracker.java
test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java
Author: omalley
Date: Fri Mar 4 04:36:41 2011
New Revision: 1077615
URL: http://svn.apache.org/viewvc?rev=1077615&view=rev
Log:
commit f1513dc4f84d112221ea5fd496513d904b7bd20f
Author: Chris Douglas <cd...@apache.org>
Date: Wed Jul 28 18:29:39 2010 -0700
Fix failing TrackerBlacklistAcrossJobs and maintain separation
between graylist (by heuristic) and blacklist (by health script) from
Modified:
hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java
hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java
Modified: hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java?rev=1077615&r1=1077614&r2=1077615&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java (original)
+++ hadoop/common/branches/branch-0.20-security-patches/src/mapred/org/apache/hadoop/mapred/JobTracker.java Fri Mar 4 04:36:41 2011
@@ -693,7 +693,6 @@ public class JobTracker implements MRCon
private int numFaultBuckets;
private long bucketWidth;
- private boolean isHealthy;
private HashMap<ReasonForBlackListing, String> blackRfbMap;
private HashMap<ReasonForBlackListing, String> grayRfbMap;
@@ -704,7 +703,6 @@ public class JobTracker implements MRCon
lastRotated = (time / bucketWidth) * bucketWidth;
blacklisted = false;
graylisted = false;
- isHealthy = true;
blackRfbMap = new HashMap<ReasonForBlackListing, String>();
grayRfbMap = new HashMap<ReasonForBlackListing, String>();
}
@@ -767,14 +765,6 @@ public class JobTracker implements MRCon
}
}
- public void setHealth(boolean isHealthy) {
- this.isHealthy = isHealthy;
- }
-
- public boolean isHealthy() {
- return isHealthy;
- }
-
public String getTrackerBlackOrGraylistReport(boolean gray) {
StringBuffer sb = new StringBuffer();
HashMap<ReasonForBlackListing, String> rfbMap =
@@ -845,8 +835,7 @@ public class JobTracker implements MRCon
long now = clock.getTime();
FaultInfo fi = getFaultInfo(hostName, true);
fi.incrFaultCount(now);
- // check heuristics, and add either to blacklist (original behavior)
- // or to graylist (new, optional behavior):
+ // check heuristics, and add to the graylist if over the limit:
if (exceedsFaults(fi, now)) {
LOG.info("Adding " + hostName + " to the graylist across all jobs");
String reason = String.format(FaultInfo.FAULT_FORMAT_STRING,
@@ -877,7 +866,7 @@ public class JobTracker implements MRCon
private boolean exceedsFaults(FaultInfo fi, long timeStamp) {
int faultCount = fi.getFaultCount(timeStamp);
if (faultCount >= TRACKER_FAULT_THRESHOLD) {
- // calculate avgBlackLists
+ // calculate average faults across all nodes
long clusterSize = getClusterStatus().getTaskTrackers();
long sum = 0;
for (FaultInfo f : potentiallyFaultyTrackers.values()) {
@@ -961,8 +950,8 @@ public class JobTracker implements MRCon
* newest ("current") bucket. Thus TRACKER_FAULT_TIMEOUT_WINDOW
* determines the timeout value for TaskTracker faults (in combination
* with TRACKER_FAULT_BUCKET_WIDTH), and the sum over all buckets is
- * compared with TRACKER_FAULT_THRESHOLD to determine whether global
- * blacklisting is warranted (or, alternatively, if it should be lifted).
+ * compared with TRACKER_FAULT_THRESHOLD to determine whether graylisting
+ * is warranted (or, alternatively, if it should be lifted).
*
* Assumes JobTracker is locked on entry.
*
@@ -1038,13 +1027,16 @@ public class JobTracker implements MRCon
synchronized (potentiallyFaultyTrackers) {
FaultInfo fi = potentiallyFaultyTrackers.remove(hostName);
if (fi != null) {
+ // a tracker can be both blacklisted and graylisted, so check both
if (fi.isGraylisted()) {
LOG.info("Removing " + hostName + " from graylist");
decrGraylistedTrackers(getNumTaskTrackersOnHost(hostName));
- } else if (fi.isBlacklisted()) {
+ }
+ if (fi.isBlacklisted()) {
LOG.info("Removing " + hostName + " from blacklist");
addHostCapacity(hostName);
}
+ // no need for fi.unBlacklist() for either one: fi is already gone
}
}
}
@@ -1067,7 +1059,8 @@ public class JobTracker implements MRCon
}
}
- // This is called on tracker's restart or after a day of blacklist.
+ // This is called when a tracker is restarted or when the health-check
+ // script reports it healthy. (Not called for graylisting.)
private void addHostCapacity(String hostName) {
synchronized (taskTrackers) {
int numTrackersOnHost = 0;
@@ -1144,7 +1137,6 @@ public class JobTracker implements MRCon
// MAPREDUCE-211 for details. We never use graylisting for this path.)
if (!isHealthy) {
fi = getFaultInfo(hostName, true);
- fi.setHealth(isHealthy);
synchronized (potentiallyFaultyTrackers) {
blacklistTracker(hostName, reason,
ReasonForBlackListing.NODE_UNHEALTHY, false);
@@ -2076,7 +2068,7 @@ public class JobTracker implements MRCon
// Number of resolved entries
int numResolved;
- // statistics about TaskTrackers with faults; may lead to blacklisting
+ // statistics about TaskTrackers with faults; may lead to graylisting
private FaultyTrackersInfo faultyTrackers = new FaultyTrackersInfo();
private JobTrackerStatistics statistics =
@@ -2216,8 +2208,8 @@ public class JobTracker implements MRCon
conf.getInt("mapred.job.tracker.retiredjobs.cache.size", 1000);
MAX_COMPLETE_USER_JOBS_IN_MEMORY = conf.getInt("mapred.jobtracker.completeuserjobs.maximum", 100);
- // values related to heuristic blacklisting ("fault" == a per-job
- // blacklisting; too many faults => node blacklisted across all jobs):
+ // values related to heuristic graylisting (a "fault" is a per-job
+ // blacklisting; too many faults => node is graylisted across all jobs):
TRACKER_FAULT_TIMEOUT_WINDOW = // 3 hours
conf.getInt("mapred.jobtracker.blacklist.fault-timeout-window", 3 * 60);
TRACKER_FAULT_BUCKET_WIDTH = // 15 minutes
@@ -2865,7 +2857,7 @@ public class JobTracker implements MRCon
addJobForCleanup(id);
// add the (single-job) blacklisted trackers to potentially faulty list
- // for possible heuristic blacklisting across all jobs
+ // for possible heuristic graylisting across all jobs
if (job.getStatus().getRunState() == JobStatus.SUCCEEDED) {
if (job.getNoOfBlackListedTrackers() > 0) {
for (String hostName : job.getBlackListedTrackers()) {
@@ -3003,13 +2995,14 @@ public class JobTracker implements MRCon
* cluster. The first element in the returned list contains the list of
* active tracker names; the second element in the returned list contains
* the list of blacklisted tracker names; and the third contains the list
- * of graylisted tracker names. Note that the blacklist is disjoint from
- * the active list, but the graylist is not: graylisted trackers are still
- * active and therefore may appear in both lists. (Graylisted trackers
- * could conceivably appear on the blacklist rather than the active list;
- * blacklisting comes about via the health-check script, while graylisting
- * is heuristically based on the number of per-job blacklistings in a
- * specified time interval.)
+ * of graylisted tracker names. Note that the blacklist is disjoint
+ * from the active list, but the graylist is not: initially, graylisted
+ * trackers are still active and therefore will appear in both lists.
+ * (Graylisted trackers can later be blacklisted, in which case they'll
+ * be removed from the active list and added to the blacklist, but they
+ * remain on the graylist in this case. Blacklisting comes about via the
+ * health-check script, while graylisting is heuristically based on the
+ * number of per-job blacklistings in a specified time interval.)
*/
// This method is synchronized to make sure that the locking order
// "taskTrackers lock followed by faultyTrackers.potentiallyFaultyTrackers
@@ -4875,8 +4868,8 @@ public class JobTracker implements MRCon
String trackerName = tracker.getTrackerName();
// Remove completely after marking the tasks as 'KILLED'
lostTaskTracker(tracker);
- // tracker is lost, and if it is blacklisted, remove
- // it from the count of blacklisted trackers in the cluster
+ // tracker is lost; if it is blacklisted and/or graylisted, remove
+ // it from the relevant count(s) of trackers in the cluster
if (isBlacklisted(trackerName)) {
faultyTrackers.decrBlacklistedTrackers(1);
}
Modified: hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java?rev=1077615&r1=1077614&r2=1077615&view=diff
==============================================================================
--- hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java (original)
+++ hadoop/common/branches/branch-0.20-security-patches/src/test/org/apache/hadoop/mapred/TestTrackerBlacklistAcrossJobs.java Fri Mar 4 04:36:41 2011
@@ -61,7 +61,7 @@ public class TestTrackerBlacklistAcrossJ
MiniMRCluster mr = null;
FileSystem fileSys = null;
Configuration conf = new Configuration();
- // setup dfs and input
+ // set up dfs and input
dfs = new MiniDFSCluster(conf, 1, true, null, hosts);
fileSys = dfs.getFileSystem();
if (!fileSys.mkdirs(inDir)) {
@@ -75,7 +75,7 @@ public class TestTrackerBlacklistAcrossJ
mr = new MiniMRCluster(3, fileSys.getUri().toString(),
1, null, hosts, jtConf);
- // setup job configuration
+ // set up job configuration
JobConf mrConf = mr.createJobConf();
JobConf job = new JobConf(mrConf);
job.setInt("mapred.max.tracker.failures", 1);
@@ -93,16 +93,24 @@ public class TestTrackerBlacklistAcrossJ
JobClient jc = new JobClient(mrConf);
RunningJob running = JobClient.runJob(job);
assertEquals("Job failed", JobStatus.SUCCEEDED, running.getJobState());
- assertEquals("Didn't blacklist the host", 1,
+ // heuristic blacklisting is graylisting as of 0.20.Fred
+ assertEquals("Blacklisted the host", 0,
jc.getClusterStatus().getBlacklistedTrackers());
+ assertEquals("Didn't graylist the host", 1,
+ jc.getClusterStatus().getGraylistedTrackers());
assertEquals("Fault count should be 1", 1, mr.getFaultCount(hosts[0]));
// run the same job once again
- // there should be no change in blacklist count
+ // there should be no change in blacklist or graylist count, but fault
+ // count (per-job blacklistings) should go up by one
running = JobClient.runJob(job);
assertEquals("Job failed", JobStatus.SUCCEEDED, running.getJobState());
- assertEquals("Didn't blacklist the host", 1,
+ assertEquals("Blacklisted the host", 0,
jc.getClusterStatus().getBlacklistedTrackers());
- assertEquals("Fault count should be 1", 1, mr.getFaultCount(hosts[0]));
+ assertEquals("Didn't graylist the host", 1,
+ jc.getClusterStatus().getGraylistedTrackers());
+ // previously this asserted 1, but makes no sense: each per-job
+ // blacklisting counts as a new fault, so 2 runs => 2 faults:
+ assertEquals("Fault count should be 2", 2, mr.getFaultCount(hosts[0]));
}
}