You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by se...@apache.org on 2017/02/27 20:24:12 UTC
[10/20] hive git commit: HIVE-16023: Wrong estimation for number of rows generated by IN expression (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)

HIVE-16023: Wrong estimation for number of rows generated by IN expression (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/10449a7a
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/10449a7a
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/10449a7a

Branch: refs/heads/hive-14535
Commit: 10449a7af25ee0db52284010f9168c46cd398bd4
Parents: 2f6f6bd
Author: Jesus Camacho Rodriguez <jc...@apache.org>
Authored: Thu Feb 23 12:05:12 2017 +0000
Committer: Jesus Camacho Rodriguez <jc...@apache.org>
Committed: Sat Feb 25 10:24:49 2017 +0000

----------------------------------------------------------------------
 .../stats/annotation/StatsRulesProcFactory.java | 46 +++++++++++++-------
 .../clientpositive/llap/explainuser_2.q.out     | 28 ++++++------
 .../clientpositive/llap/vectorization_0.q.out   | 16 +++----
 .../clientpositive/remove_exprs_stats.q.out     | 18 ++++----
 4 files changed, 61 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index bdb09a8..d9f70a7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -320,8 +320,13 @@ public class StatsRulesProcFactory {
       long newNumRows = 0;
       Statistics andStats = null;
 
-      if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0)
+      if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0) {
+        if (isDebugEnabled) {
+          LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() +
+              " Original data size: " + stats.getDataSize() + " New num rows: 1");
+        }
         return 1;
+      }
 
       if (pred instanceof ExprNodeGenericFuncDesc) {
         ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
@@ -378,23 +383,31 @@ public class StatsRulesProcFactory {
         if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
           ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
           if (cs != null) {
-            return cs.getNumTrues();
+            newNumRows = cs.getNumTrues();
+          } else {
+            // default
+            newNumRows = stats.getNumRows() / 2;
           }
+        } else {
+          // if not boolean column return half the number of rows
+          newNumRows = stats.getNumRows() / 2;
         }
-
-        // if not boolean column return half the number of rows
-        return stats.getNumRows() / 2;
       } else if (pred instanceof ExprNodeConstantDesc) {
 
         // special case for handling false constants
         ExprNodeConstantDesc encd = (ExprNodeConstantDesc) pred;
         if (Boolean.FALSE.equals(encd.getValue())) {
-          return 0;
+          newNumRows = 0;
         } else {
-          return stats.getNumRows();
+          newNumRows = stats.getNumRows();
         }
       }
 
+      if (isDebugEnabled) {
+        LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() +
+            " New num rows: " + newNumRows);
+      }
+
       return newNumRows;
     }
 
@@ -476,15 +489,16 @@ public class StatsRulesProcFactory {
       }
 
       // 3. Calculate IN selectivity
-      float factor = 1;
+      double factor = 1d;
       for (int i = 0; i < columnStats.size(); i++) {
         long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint();
-        // ( num of distinct vals for col / num of rows ) * num of distinct vals for col in IN clause
-        float columnFactor = dvs == 0 ? 0.5f : ((float)dvs / numRows) * values.get(i).size();
-        factor *= columnFactor;
+        // (num of distinct vals for col in IN clause  / num of distinct vals for col )
+        double columnFactor = dvs == 0 ? 0.5d : ((double) values.get(i).size() / dvs);
+        // max can be 1, even when ndv is larger in IN clause than in column stats
+        factor *= columnFactor > 1d ? 1d : columnFactor;
       }
       float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
-      return Math.round( (double)numRows * factor * inFactor);
+      return Math.round( (double) numRows * factor * inFactor);
     }
 
     private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx,
@@ -1828,11 +1842,11 @@ public class StatsRulesProcFactory {
         Map<Integer, Long> rowCountParents) {
 
       if (newNumRows < 0) {
-        LOG.info("STATS-" + jop.toString() + ": Overflow in number of rows."
+        LOG.debug("STATS-" + jop.toString() + ": Overflow in number of rows. "
           + newNumRows + " rows will be set to Long.MAX_VALUE");
       }
       if (newNumRows == 0) {
-        LOG.info("STATS-" + jop.toString() + ": Equals 0 in number of rows."
+        LOG.debug("STATS-" + jop.toString() + ": Equals 0 in number of rows. "
             + newNumRows + " rows will be set to 1");
         newNumRows = 1;
       }
@@ -2252,12 +2266,12 @@ public class StatsRulesProcFactory {
       boolean updateNDV) {
 
     if (newNumRows < 0) {
-      LOG.info("STATS-" + op.toString() + ": Overflow in number of rows."
+      LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. "
           + newNumRows + " rows will be set to Long.MAX_VALUE");
       newNumRows = StatsUtils.getMaxIfOverflow(newNumRows);
     }
     if (newNumRows == 0) {
-      LOG.info("STATS-" + op.toString() + ": Equals 0 in number of rows."
+      LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. "
           + newNumRows + " rows will be set to 1");
       newNumRows = 1;
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
index 67f960a..e898111 100644
--- a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
+++ b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
@@ -298,29 +298,29 @@ Stage-0
     Stage-1
       Reducer 5 llap
       File Output Operator [FS_55]
-        Limit [LIM_54] (rows=24 width=285)
+        Limit [LIM_54] (rows=14 width=285)
           Number of rows:100
-          Select Operator [SEL_53] (rows=24 width=285)
+          Select Operator [SEL_53] (rows=14 width=285)
             Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
           <-Reducer 4 [SIMPLE_EDGE] llap
             SHUFFLE [RS_52]
-              Group By Operator [GBY_50] (rows=24 width=285)
+              Group By Operator [GBY_50] (rows=14 width=285)
                 Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"],keys:KEY._col0, KEY._col1, KEY._col2
               <-Reducer 3 [SIMPLE_EDGE] llap
                 SHUFFLE [RS_49]
                   PartitionCols:_col0, _col1, _col2
-                  Group By Operator [GBY_48] (rows=24 width=285)
+                  Group By Operator [GBY_48] (rows=14 width=285)
                     Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(_col13)","count(_col21)","count(_col3)"],keys:_col12, _col20, _col2
-                    Select Operator [SEL_47] (rows=650 width=534)
+                    Select Operator [SEL_47] (rows=363 width=534)
                       Output:["_col12","_col20","_col2","_col13","_col21","_col3"]
-                      Merge Join Operator [MERGEJOIN_97] (rows=650 width=534)
+                      Merge Join Operator [MERGEJOIN_97] (rows=363 width=534)
                         Conds:RS_44._col1, _col3=RS_45._col15, _col17(Inner),Output:["_col2","_col3","_col12","_col13","_col20","_col21"]
                       <-Reducer 11 [SIMPLE_EDGE] llap
                         SHUFFLE [RS_45]
                           PartitionCols:_col15, _col17
-                          Select Operator [SEL_40] (rows=190 width=447)
+                          Select Operator [SEL_40] (rows=180 width=447)
                             Output:["_col14","_col15","_col17","_col6","_col7"]
-                            Merge Join Operator [MERGEJOIN_96] (rows=190 width=447)
+                            Merge Join Operator [MERGEJOIN_96] (rows=180 width=447)
                               Conds:RS_37._col6, _col4=RS_38._col4, _col2(Inner),Output:["_col2","_col3","_col14","_col15","_col17"]
                             <-Reducer 10 [SIMPLE_EDGE] llap
                               SHUFFLE [RS_37]
@@ -376,7 +376,7 @@ Stage-0
                             <-Reducer 16 [SIMPLE_EDGE] llap
                               SHUFFLE [RS_38]
                                 PartitionCols:_col4, _col2
-                                Merge Join Operator [MERGEJOIN_95] (rows=19 width=356)
+                                Merge Join Operator [MERGEJOIN_95] (rows=18 width=356)
                                   Conds:RS_24._col0=RS_25._col0(Inner),Output:["_col2","_col3","_col4","_col5"]
                                 <-Map 15 [SIMPLE_EDGE] llap
                                   SHUFFLE [RS_24]
@@ -390,16 +390,16 @@ Stage-0
                                 <-Map 17 [SIMPLE_EDGE] llap
                                   SHUFFLE [RS_25]
                                     PartitionCols:_col0
-                                    Select Operator [SEL_23] (rows=500 width=178)
+                                    Select Operator [SEL_23] (rows=7 width=178)
                                       Output:["_col0"]
-                                      Filter Operator [FIL_90] (rows=500 width=178)
+                                      Filter Operator [FIL_90] (rows=7 width=178)
                                         predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null)
                                         TableScan [TS_21] (rows=500 width=178)
                                           default@src,d2,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
                       <-Reducer 2 [SIMPLE_EDGE] llap
                         SHUFFLE [RS_44]
                           PartitionCols:_col1, _col3
-                          Merge Join Operator [MERGEJOIN_91] (rows=414 width=269)
+                          Merge Join Operator [MERGEJOIN_91] (rows=99 width=269)
                             Conds:RS_41._col0=RS_42._col0(Inner),Output:["_col1","_col2","_col3"]
                           <-Map 1 [SIMPLE_EDGE] llap
                             SHUFFLE [RS_41]
@@ -413,9 +413,9 @@ Stage-0
                           <-Map 6 [SIMPLE_EDGE] llap
                             SHUFFLE [RS_42]
                               PartitionCols:_col0
-                              Select Operator [SEL_5] (rows=500 width=178)
+                              Select Operator [SEL_5] (rows=7 width=178)
                                 Output:["_col0"]
-                                Filter Operator [FIL_84] (rows=500 width=178)
+                                Filter Operator [FIL_84] (rows=7 width=178)
                                   predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null)
                                   TableScan [TS_3] (rows=500 width=178)
                                     default@src,d3,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]

http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
index 67fcdaa..af0bad9 100644
--- a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
+++ b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
@@ -30777,19 +30777,19 @@ STAGE PLANS:
                   Filter Operator
                     isSamplingPred: false
                     predicate: (cstring1) IN ('biology', 'history', 'topology') (type: boolean)
-                    Statistics: Num rows: 12288 Data size: 862450 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 5 Data size: 470 Basic stats: COMPLETE Column stats: COMPLETE
                     Group By Operator
                       aggregations: count()
                       keys: cstring1 (type: string)
                       mode: hash
                       outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
                         key expressions: _col0 (type: string)
                         null sort order: a
                         sort order: +
                         Map-reduce partition columns: _col0 (type: string)
-                        Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+                        Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                         tag: -1
                         value expressions: _col1 (type: bigint)
                         auto parallelism: true
@@ -30855,16 +30855,16 @@ STAGE PLANS:
                 keys: KEY._col0 (type: string)
                 mode: mergepartial
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                 Select Operator
                   expressions: _col1 (type: bigint), _col0 (type: string)
                   outputColumnNames: _col0, _col1
-                  Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                   Reduce Output Operator
                     key expressions: _col1 (type: string)
                     null sort order: a
                     sort order: +
-                    Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                     tag: -1
                     value expressions: _col0 (type: bigint)
                     auto parallelism: false
@@ -30875,13 +30875,13 @@ STAGE PLANS:
               Select Operator
                 expressions: VALUE._col0 (type: bigint), KEY.reducesinkkey0 (type: string)
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
                   GlobalTableId: 0
 #### A masked pattern was here ####
                   NumFilesPerFileSink: 1
-                  Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
 #### A masked pattern was here ####
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat

http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
index 8fe688d..567e6b2 100644
--- a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
+++ b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
@@ -449,14 +449,14 @@ STAGE PLANS:
             Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
             Filter Operator
               predicate: (locid) IN (5) (type: boolean)
-              Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -485,14 +485,14 @@ STAGE PLANS:
             Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
             Filter Operator
               predicate: (locid) IN (5, 2, 3) (type: boolean)
-              Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -521,14 +521,14 @@ STAGE PLANS:
             Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
             Filter Operator
               predicate: (locid) IN (1, 6) (type: boolean)
-              Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat