Posted to common-commits@hadoop.apache.org by ac...@apache.org on 2008/10/03 17:10:39 UTC

svn commit: r701400 - in /hadoop/core/branches/branch-0.19: ./ CHANGES.txt src/mapred/org/apache/hadoop/mapred/ReduceTask.java

Author: acmurthy
Date: Fri Oct  3 08:10:38 2008
New Revision: 701400

URL: http://svn.apache.org/viewvc?rev=701400&view=rev
Log:
Merge -r 701397:701398 from trunk to branch-0.19 to fix HADOOP-4246.

Modified:
    hadoop/core/branches/branch-0.19/   (props changed)
    hadoop/core/branches/branch-0.19/CHANGES.txt
    hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java

Propchange: hadoop/core/branches/branch-0.19/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Fri Oct  3 08:10:38 2008
@@ -1 +1 @@
-/hadoop/core/trunk:697306,698176,699056,699098,699415,699424,699444,699490,699517,700163,700628,700923,701273
+/hadoop/core/trunk:697306,698176,699056,699098,699415,699424,699444,699490,699517,700163,700628,700923,701273,701398

Modified: hadoop/core/branches/branch-0.19/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/CHANGES.txt?rev=701400&r1=701399&r2=701400&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.19/CHANGES.txt Fri Oct  3 08:10:38 2008
@@ -787,6 +787,11 @@
     HADOOP-4319. fuse-dfs dfs_read function returns as many bytes as it is
     told to read unless end-of-file is reached.  (Pete Wyckoff via dhruba)
 
+    HADOOP-4246. Ensure we have the correct lower bound on the number of
+    retries for fetching map-outputs; also fixed the case where, for small
+    jobs, the reducer could never kill itself even when too many unique
+    map-outputs failed to be fetched. (Amareshwari Sri Ramadasu via acmurthy)
+    
 Release 0.18.2 - Unreleased
 
   BUG FIXES
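
Aside for readers of the changelog entry above, not part of the patch: the
small-jobs half of the fix is simply a cap of the unique-fetch-failure
threshold at the job's map count. The fragment below is an illustrative
sketch, not the ReduceTask code; the default value 5 comes from the diff
further down, while the class and method names are invented for the example.

public class SmallJobThresholdSketch {
  // Default threshold visible in the diff: 5 unique maps must fail before the reduce gives up.
  private static final int DEFAULT_MAX_FAILED_UNIQUE_FETCHES = 5;

  // HADOOP-4246: clamp the threshold to the number of maps; otherwise a job with
  // fewer than 5 maps can never accumulate enough unique failures to trip it.
  static int maxFailedUniqueFetches(int numMaps) {
    return Math.min(numMaps, DEFAULT_MAX_FAILED_UNIQUE_FETCHES);
  }

  public static void main(String[] args) {
    System.out.println(maxFailedUniqueFetches(3));    // 3 -- a 3-map job can now fail fast
    System.out.println(maxFailedUniqueFetches(1000)); // 5 -- large jobs keep the old threshold
  }
}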

Modified: hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java?rev=701400&r1=701399&r2=701400&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java (original)
+++ hadoop/core/branches/branch-0.19/src/mapred/org/apache/hadoop/mapred/ReduceTask.java Fri Oct  3 08:10:38 2008
@@ -615,13 +615,18 @@
      * Maximum percent of shuffle execution time required to keep the reducer alive.
      */
     private static final float MAX_ALLOWED_STALL_TIME_PERCENT = 0.5f;
+    
+    /**
+     * Minimum number of map fetch retries.
+     */
+    private static final int MIN_FETCH_RETRIES_PER_MAP = 2;
 
     /**
      * Maximum no. of unique maps from which we failed to fetch map-outputs
      * even after {@link #maxFetchRetriesPerMap} retries; after this the
      * reduce task is failed.
      */
-    private static final int MAX_FAILED_UNIQUE_FETCHES = 5;
+    private int maxFailedUniqueFetches = 5;
 
     /**
      * The maps from which we fail to fetch map-outputs 
@@ -1553,8 +1558,10 @@
       // the order is 4,8,16,32,64,128. sum of which is 252 sec = 4.2 min
       
       // optimizing for the base 2
-      this.maxFetchRetriesPerMap = getClosestPowerOf2((this.maxBackoff * 1000 
-                                                       / BACKOFF_INIT) + 1); 
+      this.maxFetchRetriesPerMap = Math.max(MIN_FETCH_RETRIES_PER_MAP, 
+             getClosestPowerOf2((this.maxBackoff * 1000 / BACKOFF_INIT) + 1));
+      this.maxFailedUniqueFetches = Math.min(numMaps, 
+                                             this.maxFailedUniqueFetches);
       this.maxInMemOutputs = conf.getInt("mapred.inmem.merge.threshold", 1000);
       this.maxInMemCopyPer =
         conf.getFloat("mapred.job.shuffle.merge.percent", 0.66f);
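
Aside, not part of the patch: a hedged sketch of the arithmetic in the hunk
above. It assumes BACKOFF_INIT is 4000 ms and that getClosestPowerOf2 returns
the exponent of the power of two nearest its argument; both are inferred from
the "4,8,16,32,64,128 ... 252 sec" comment, not taken verbatim from this commit.

public class FetchRetrySketch {
  private static final int BACKOFF_INIT = 4000;            // assumed: first backoff of 4 s, doubling each retry
  private static final int MIN_FETCH_RETRIES_PER_MAP = 2;  // new lower bound from the diff

  // Assumed semantics: exponent of the power of two nearest to value.
  static int getClosestPowerOf2(int value) {
    int exp = 0;
    while ((1 << (exp + 1)) <= value) {
      exp++;
    }
    return ((1 << (exp + 1)) - value < value - (1 << exp)) ? exp + 1 : exp;
  }

  static int maxFetchRetriesPerMap(int maxBackoffSeconds) {
    // e.g. maxBackoff = 300 s: (300 * 1000 / 4000) + 1 = 76, nearest power of two is 64 = 2^6,
    // so 6 retries backing off 4 + 8 + 16 + 32 + 64 + 128 = 252 s, the ~4.2 min the comment cites.
    // The Math.max floor guarantees at least 2 retries even for a very small maxBackoff.
    return Math.max(MIN_FETCH_RETRIES_PER_MAP,
                    getClosestPowerOf2((maxBackoffSeconds * 1000 / BACKOFF_INIT) + 1));
  }

  public static void main(String[] args) {
    System.out.println(maxFetchRetriesPerMap(300)); // 6
    System.out.println(maxFetchRetriesPerMap(4));   // 2 (floored; the raw value would be 1)
  }
}
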
@@ -1909,7 +1916,8 @@
                      >= MAX_ALLOWED_STALL_TIME_PERCENT);
                 
                 // kill if not healthy and has insufficient progress
-                if ((fetchFailedMaps.size() >= MAX_FAILED_UNIQUE_FETCHES)
+                if ((fetchFailedMaps.size() >= maxFailedUniqueFetches ||
+                     fetchFailedMaps.size() == (numMaps - copiedMapOutputs.size()))
                     && !reducerHealthy 
                     && (!reducerProgressedEnough || reducerStalled)) { 
                   LOG.fatal("Shuffle failed with too many fetch failures " + 
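
Aside, not part of the patch: the predicate in the hunk above, restated as a
standalone boolean for readability. The method name and the flattened
parameters are invented for this sketch; only the condition itself mirrors
the diff.

public class ShuffleKillConditionSketch {
  // Fail the reduce only when fetches look hopeless *and* the reducer itself is unhealthy and stuck:
  //  - either the usual unique-failure threshold is reached, or every map output that is still
  //    outstanding has already failed at least once (the small-job case added by this commit),
  //  - and the reducer has not fetched enough outputs to be considered healthy,
  //  - and it has either made too little progress or has stalled for too long.
  static boolean shouldFailReduce(int fetchFailedMaps, int numMaps, int copiedMapOutputs,
                                  int maxFailedUniqueFetches, boolean reducerHealthy,
                                  boolean reducerProgressedEnough, boolean reducerStalled) {
    return (fetchFailedMaps >= maxFailedUniqueFetches
            || fetchFailedMaps == numMaps - copiedMapOutputs)
        && !reducerHealthy
        && (!reducerProgressedEnough || reducerStalled);
  }

  public static void main(String[] args) {
    // 3-map job, nothing copied yet, all 3 remaining maps have failed fetches: the reduce now fails.
    System.out.println(shouldFailReduce(3, 3, 0, 3, false, false, true)); // true
    // Same counts but the reducer is healthy: keep going.
    System.out.println(shouldFailReduce(3, 3, 0, 3, true, false, true));  // false
  }
}
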
@@ -2249,8 +2257,8 @@
             if (duration > maxMapRuntime) {
               maxMapRuntime = duration; 
               // adjust max-fetch-retries based on max-map-run-time
-              maxFetchRetriesPerMap = 
-                  getClosestPowerOf2((maxMapRuntime / BACKOFF_INIT) + 1);
+              maxFetchRetriesPerMap = Math.max(MIN_FETCH_RETRIES_PER_MAP, 
+                getClosestPowerOf2((maxMapRuntime / BACKOFF_INIT) + 1));
             }
             URL mapOutputLocation = new URL(event.getTaskTrackerHttp() + 
                                     "/mapOutput?job=" + taskId.getJobID() +