Posted to common-commits@hadoop.apache.org by ac...@apache.org on 2008/10/03 17:10:39 UTC
svn commit: r701400 - in /hadoop/core/branches/branch-0.19: ./ CHANGES.txt src/mapred/org/apache/hadoop/mapred/ReduceTask.java
Author: acmurthy
Date: Fri Oct 3 08:10:38 2008
New Revision: 701400
URL: http://svn.apache.org/viewvc?rev=701400&view=rev
Log:
Merge -r 701397:701398 from trunk to branch-0.19 to fix HADOOP-4246.
Modified:
hadoop/core/branches/branch-0.19/ (props changed)
hadoop/core/branches/branch-0.19/CHANGES.txt
hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java
Propchange: hadoop/core/branches/branch-0.19/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri Oct 3 08:10:38 2008
@@ -1 +1 @@
-/hadoop/core/trunk:697306,698176,699056,699098,699415,699424,699444,699490,699517,700163,700628,700923,701273
+/hadoop/core/trunk:697306,698176,699056,699098,699415,699424,699444,699490,699517,700163,700628,700923,701273,701398
Modified: hadoop/core/branches/branch-0.19/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/CHANGES.txt?rev=701400&r1=701399&r2=701400&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.19/CHANGES.txt Fri Oct 3 08:10:38 2008
@@ -787,6 +787,11 @@
HADOOP-4319. fuse-dfs dfs_read function returns as many bytes as it is
told to read unless end-of-file is reached. (Pete Wyckoff via dhruba)
+ HADOOP-4246. Ensure we have the correct lower bound on the number of
+ retries for fetching map-outputs; also fixed the case where, for small
+ jobs, the reducer was not killed even when map-outputs from too many
+ unique maps could not be fetched. (Amareshwari Sri Ramadasu via acmurthy)
+
Release 0.18.2 - Unreleased
BUG FIXES
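
For reference, the two clamps described in the HADOOP-4246 entry above can be
sketched outside the patch. This is a minimal, hypothetical standalone class
(not part of the commit); getClosestPowerOf2 is assumed to return the exponent
of the power of two closest to its argument, which is what the 4,8,16,...
backoff comment in the patch implies:

    // Hypothetical sketch of the two clamps added by HADOOP-4246;
    // constant names mirror ReduceTask.java.
    public class FetchRetryMath {
      private static final int BACKOFF_INIT = 4000;           // ms, first backoff
      private static final int MIN_FETCH_RETRIES_PER_MAP = 2; // new lower bound

      // Exponent of the power of two closest to 'value' (assumed semantics,
      // implied by the 4,8,16,... backoff comment in the patch).
      static int getClosestPowerOf2(int value) {
        int hob = Integer.highestOneBit(value);
        return Integer.numberOfTrailingZeros(hob)
            + (((hob >>> 1) & value) == 0 ? 0 : 1);
      }

      public static void main(String[] args) {
        // With mapred.reduce.copy.backoff as low as 4 seconds, the raw
        // formula allows a single retry; the fix clamps it to at least 2.
        int maxBackoff = 4; // seconds
        int raw = getClosestPowerOf2((maxBackoff * 1000 / BACKOFF_INIT) + 1);
        int clamped = Math.max(MIN_FETCH_RETRIES_PER_MAP, raw);
        System.out.println("raw=" + raw + ", clamped=" + clamped); // raw=1, clamped=2

        // For a 3-map job the unique-failed-fetch threshold drops from a
        // fixed 5 to 3, so small jobs can actually reach it.
        int numMaps = 3;
        int maxFailedUniqueFetches = Math.min(numMaps, 5);
        System.out.println("threshold=" + maxFailedUniqueFetches); // 3
      }
    }
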
Modified: hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java?rev=701400&r1=701399&r2=701400&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java (original)
+++ hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java Fri Oct 3 08:10:38 2008
@@ -615,13 +615,18 @@
* Maximum percent of shuffle execution time required to keep the reducer alive.
*/
private static final float MAX_ALLOWED_STALL_TIME_PERCENT = 0.5f;
+
+ /**
+ * Minimum number of map fetch retries.
+ */
+ private static final int MIN_FETCH_RETRIES_PER_MAP = 2;
/**
* Maximum no. of unique maps from which we failed to fetch map-outputs
* even after {@link #maxFetchRetriesPerMap} retries; after this the
* reduce task is failed.
*/
- private static final int MAX_FAILED_UNIQUE_FETCHES = 5;
+ private int maxFailedUniqueFetches = 5;
/**
* The maps from which we fail to fetch map-outputs
@@ -1553,8 +1558,10 @@
// the order is 4,8,16,32,64,128. sum of which is 252 sec = 4.2 min
// optimizing for the base 2
- this.maxFetchRetriesPerMap = getClosestPowerOf2((this.maxBackoff * 1000
- / BACKOFF_INIT) + 1);
+ this.maxFetchRetriesPerMap = Math.max(MIN_FETCH_RETRIES_PER_MAP,
+ getClosestPowerOf2((this.maxBackoff * 1000 / BACKOFF_INIT) + 1));
+ this.maxFailedUniqueFetches = Math.min(numMaps,
+ this.maxFailedUniqueFetches);
this.maxInMemOutputs = conf.getInt("mapred.inmem.merge.threshold", 1000);
this.maxInMemCopyPer =
conf.getFloat("mapred.job.shuffle.merge.percent", 0.66f);
@@ -1909,7 +1916,8 @@
>= MAX_ALLOWED_STALL_TIME_PERCENT);
// kill if not healthy and has insufficient progress
- if ((fetchFailedMaps.size() >= MAX_FAILED_UNIQUE_FETCHES)
+ if ((fetchFailedMaps.size() >= maxFailedUniqueFetches ||
+ fetchFailedMaps.size() == (numMaps - copiedMapOutputs.size()))
&& !reducerHealthy
&& (!reducerProgressedEnough || reducerStalled)) {
LOG.fatal("Shuffle failed with too many fetch failures " +
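
The extra disjunct in this hunk is what makes the kill threshold reachable for
small jobs: once every not-yet-copied map has failed, there is nothing left to
retry. A hedged illustration of the predicate follows (a hypothetical demo;
the variable names mirror the hunk and the values are made up):

    // Hypothetical demo of the kill condition for a 3-map job.
    public class KillConditionDemo {
      public static void main(String[] args) {
        int numMaps = 3;           // small job
        int copiedMapOutputs = 1;  // one map-output fetched successfully
        int fetchFailedMaps = 2;   // both remaining maps keep failing
        int maxFailedUniqueFetches = Math.min(numMaps, 5); // 3 after the fix

        boolean reducerHealthy = false;          // shuffle mostly failing
        boolean reducerProgressedEnough = false;
        boolean reducerStalled = true;

        // Old predicate: a fixed threshold of 5 unique failed maps could
        // never be reached with only 3 maps, so the reducer hung retrying.
        boolean oldKill = (fetchFailedMaps >= 5)
            && !reducerHealthy && (!reducerProgressedEnough || reducerStalled);

        // New predicate also fires when every remaining map has failed.
        boolean newKill = (fetchFailedMaps >= maxFailedUniqueFetches
                || fetchFailedMaps == (numMaps - copiedMapOutputs))
            && !reducerHealthy && (!reducerProgressedEnough || reducerStalled);

        System.out.println("old=" + oldKill + ", new=" + newKill); // old=false, new=true
      }
    }
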
@@ -2249,8 +2257,8 @@
if (duration > maxMapRuntime) {
maxMapRuntime = duration;
// adjust max-fetch-retries based on max-map-run-time
- maxFetchRetriesPerMap =
- getClosestPowerOf2((maxMapRuntime / BACKOFF_INIT) + 1);
+ maxFetchRetriesPerMap = Math.max(MIN_FETCH_RETRIES_PER_MAP,
+ getClosestPowerOf2((maxMapRuntime / BACKOFF_INIT) + 1));
}
URL mapOutputLocation = new URL(event.getTaskTrackerHttp() +
"/mapOutput?job=" + taskId.getJobID() +