You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2009/06/16 02:35:31 UTC

svn commit: r785036 - in /hadoop/hive/trunk: CHANGES.txt common/src/java/org/apache/hadoop/hive/conf/HiveConf.java conf/hive-default.xml ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java

Author: namit
Date: Tue Jun 16 00:35:31 2009
New Revision: 785036

URL: http://svn.apache.org/viewvc?rev=785036&view=rev
Log:
HIVE-561. Make hash aggregation threshold configurable
(Zheng Shao via namit)


Modified:
    hadoop/hive/trunk/CHANGES.txt
    hadoop/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hadoop/hive/trunk/conf/hive-default.xml
    hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java

Modified: hadoop/hive/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/CHANGES.txt?rev=785036&r1=785035&r2=785036&view=diff
==============================================================================
--- hadoop/hive/trunk/CHANGES.txt (original)
+++ hadoop/hive/trunk/CHANGES.txt Tue Jun 16 00:35:31 2009
@@ -236,6 +236,9 @@
     HIVE-547. Better logging in ExecDriver
     (Zheng Shao via namit)
 
+    HIVE-561. Make hash aggregation threshold configurable
+    (Zheng Shao via namit)
+
 Release 0.3.1 - Unreleased
 
   INCOMPATIBLE CHANGES

Modified: hadoop/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=785036&r1=785035&r2=785036&view=diff
==============================================================================
--- hadoop/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hadoop/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Tue Jun 16 00:35:31 2009
@@ -117,6 +117,7 @@
     HIVEMAPJOINCACHEROWS("hive.mapjoin.cache.numrows", 10000),
     HIVEGROUPBYMAPINTERVAL("hive.groupby.mapaggr.checkinterval", 100000),
     HIVEMAPAGGRHASHMEMORY("hive.map.aggr.hash.percentmemory", (float)0.5),
+    HIVEMAPAGGRHASHMINREDUCTION("hive.map.aggr.hash.min.reduction", (float)0.5),
     
     // Default file format for CREATE TABLE statement
     // Options: TextFile, SequenceFile

Modified: hadoop/hive/trunk/conf/hive-default.xml
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/conf/hive-default.xml?rev=785036&r1=785035&r2=785036&view=diff
==============================================================================
--- hadoop/hive/trunk/conf/hive-default.xml (original)
+++ hadoop/hive/trunk/conf/hive-default.xml Tue Jun 16 00:35:31 2009
@@ -145,6 +145,14 @@
 </property>
 
 <property>
+  <name>hive.map.aggr.hash.min.reduction</name>
+  <value>0.5</value>
+  <description>Hash aggregation will be turned off if the ratio of hash table
+  entries to input rows is greater than this number. Set to 1 to make sure
+  hash aggregation is never turned off.</description>
+</property>
+
+<property>
   <name>hive.optimize.ppd</name>
   <value>false</value>
   <description>Whether to enable predicate pushdown</description>

Modified: hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
URL: http://svn.apache.org/viewvc/hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java?rev=785036&r1=785035&r2=785036&view=diff
==============================================================================
--- hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (original)
+++ hadoop/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java Tue Jun 16 00:35:31 2009
@@ -97,6 +97,7 @@
   transient long    numRowsHashTbl;
   transient int     groupbyMapAggrInterval;
   transient long    numRowsCompareHashAggr;
+  transient float   minReductionHashAggr;
 
   
   /**
@@ -130,7 +131,7 @@
   transient int           totalVariableSize;
   transient int           numEntriesVarSize;
   transient int           numEntriesHashTable;
-
+  
   public void initializeOp(Configuration hconf, Reporter reporter, ObjectInspector[] inputObjInspector) throws HiveException {
 
     totalMemory = Runtime.getRuntime().totalMemory();
@@ -227,6 +228,7 @@
 
       // compare every groupbyMapAggrInterval rows
       numRowsCompareHashAggr = groupbyMapAggrInterval;
+      minReductionHashAggr = HiveConf.getFloatVar(hconf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
     }
 
     // init objectInspectors
@@ -475,13 +477,16 @@
       if (numRowsInput == numRowsCompareHashAggr) {
         numRowsCompareHashAggr += groupbyMapAggrInterval;
         // map-side aggregation should reduce the entries by at-least half
-        if ((numRowsHashTbl * 2) > numRowsInput) {
-          LOG.warn("Disable Hash Aggr: #hash table = " + numRowsHashTbl + " #total = " + numRowsInput);
+        if (numRowsHashTbl > numRowsInput * minReductionHashAggr) {
+          LOG.warn("Disable Hash Aggr: #hash table = " + numRowsHashTbl + " #total = " + numRowsInput 
+              + " reduction = " + 1.0*(numRowsHashTbl/numRowsInput) + " minReduction = " + minReductionHashAggr);
           flush(true);
           hashAggr = false;
         }
-        else
-          LOG.trace("Hash Aggr Enabled: #hash table = " + numRowsHashTbl + " #total = " + numRowsInput);
+        else {
+          LOG.trace("Hash Aggr Enabled: #hash table = " + numRowsHashTbl + " #total = " + numRowsInput 
+              + " reduction = " + 1.0*(numRowsHashTbl/numRowsInput) + " minReduction = " + minReductionHashAggr);
+        }
       }
     }