You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by nz...@apache.org on 2011/05/04 21:07:17 UTC

svn commit: r1099560 - /hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapRedTask.java

Author: nzhang
Date: Wed May  4 19:07:17 2011
New Revision: 1099560

URL: http://svn.apache.org/viewvc?rev=1099560&view=rev
Log:
HIVE-2146. Block Sampling should adjust the number of reducers accordingly to make it useful (Siying Dong via Ning Zhang)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapRedTask.java

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapRedTask.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapRedTask.java?rev=1099560&r1=1099559&r2=1099560&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapRedTask.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/MapRedTask.java Wed May  4 19:07:17 2011
@@ -368,8 +368,34 @@ public class MapRedTask extends ExecDriv
 
     long totalInputFileSize = inputSummary.getLength();
 
-    LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
+    // If all inputs are sampled, shrink the estimated total input size accordingly
+    // (which in turn lowers the estimated number of reducers).
+    double highestSamplePercentage = 0;
+    boolean allSample = false;
+    for (String alias : work.getAliasToWork().keySet()) {
+      if (work.getNameToSplitSample().containsKey(alias)) {
+        allSample = true;
+        double rate = work.getNameToSplitSample().get(alias).getPercent();
+        if (rate > highestSamplePercentage) {
+          highestSamplePercentage = rate;
+        }
+      } else {
+        allSample = false;
+        break;
+      }
+    }
+    if (allSample) {
+      // This is a little bit dangerous if the inputs turn out not to be sampleable.
+      // In that case, we significantly underestimate the number of reducers.
+      // It's the same as other cases of estimateNumberOfReducers(). It's just our best
+      // guess and there is no guarantee.
+      totalInputFileSize = Math.min((long) (totalInputFileSize * highestSamplePercentage / 100D)
+          , totalInputFileSize);
+      LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
+          + maxReducers + " estimated totalInputFileSize=" + totalInputFileSize);
+    } else {
+      LOG.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
         + maxReducers + " totalInputFileSize=" + totalInputFileSize);
+    }
 
     int reducers = (int) ((totalInputFileSize + bytesPerReducer - 1) / bytesPerReducer);
     reducers = Math.max(1, reducers);