You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2014/08/22 17:30:33 UTC
svn commit: r1619839 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java test/results/clientpositive/annotate_stats_join.q.out

Author: hashutosh
Date: Fri Aug 22 15:30:33 2014
New Revision: 1619839

URL: http://svn.apache.org/r1619839
Log:
HIVE-7836 : Ease-out denominator for multi-attribute join case in statistics annotation (Prasanth J via Ashutosh Chauhan)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
    hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java?rev=1619839&r1=1619838&r2=1619839&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java Fri Aug 22 15:30:33 2014
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.optimi
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hive.conf.HiveConf;
@@ -811,6 +812,7 @@ public class StatsRulesProcFactory {
           // 2 relations, multiple attributes
           boolean multiAttr = false;
           int numAttr = 1;
+          int numParent = parents.size();
 
           Map<String, ColStatistics> joinedColStats = Maps.newHashMap();
           Map<Integer, List<String>> joinKeys = Maps.newHashMap();
@@ -873,12 +875,19 @@ public class StatsRulesProcFactory {
                   perAttrDVs.add(cs.getCountDistint());
                 }
               }
+
               distinctVals.add(getDenominator(perAttrDVs));
               perAttrDVs.clear();
             }
 
-            for (Long l : distinctVals) {
-              denom *= l;
+            if (numAttr > numParent) {
+              // To avoid denominator getting larger and aggressively reducing
+              // number of rows, we will ease out denominator.
+              denom = getEasedOutDenominator(distinctVals);
+            } else {
+              for (Long l : distinctVals) {
+                denom *= l;
+              }
             }
           } else {
             for (List<String> jkeys : joinKeys.values()) {
@@ -983,6 +992,20 @@ public class StatsRulesProcFactory {
       return null;
     }
 
+    private Long getEasedOutDenominator(List<Long> distinctVals) {
+      // Exponential back-off for NDVs, so extra join attributes shrink the estimate less and less.
+      // 1) Sort the NDVs in descending order
+      // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4)) * ....
+      Collections.sort(distinctVals, Collections.reverseOrder()); // in place: reorders the caller's list
+
+      long denom = distinctVals.get(0); // largest NDV is taken as-is; the list must be non-empty
+      for (int i = 1; i < distinctVals.size(); i++) {
+        denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i))); // factor i: NDV ^ (1 / 2^i); truncated to long each pass
+      }
+
+      return denom;
+    }
+
     private void updateStatsForJoinType(Statistics stats, long newNumRows,
         JoinDesc conf, Map<String, Long> rowCountParents,
         Map<String, String> outInTabAlias) {

Modified: hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out?rev=1619839&r1=1619838&r2=1619839&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/annotate_stats_join.q.out Fri Aug 22 15:30:33 2014
@@ -391,17 +391,17 @@ STAGE PLANS:
             0 {KEY.reducesinkkey1} {KEY.reducesinkkey0} {VALUE._col0}
             1 {KEY.reducesinkkey0} {KEY.reducesinkkey1}
           outputColumnNames: _col0, _col1, _col2, _col6, _col7
-          Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 11 Data size: 2134 Basic stats: COMPLETE Column stats: COMPLETE
           Filter Operator
             predicate: (((_col1 = _col6) and (_col0 = _col7)) and (_col7 = _col0)) (type: boolean)
-            Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE
+            Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
             Select Operator
               expressions: _col0 (type: string), _col1 (type: int), _col2 (type: int), _col6 (type: int), _col7 (type: string)
               outputColumnNames: _col0, _col1, _col2, _col3, _col4
-              Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE
+              Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
               File Output Operator
                 compressed: false
-                Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE
+                Statistics: Num rows: 1 Data size: 194 Basic stats: COMPLETE Column stats: COMPLETE
                 table:
                     input format: org.apache.hadoop.mapred.TextInputFormat
                     output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat