You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by se...@apache.org on 2017/02/27 20:24:12 UTC
[10/20] hive git commit: HIVE-16023: Wrong estimation for number of rows generated by IN expression (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)
HIVE-16023: Wrong estimation for number of rows generated by IN expression (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/10449a7a
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/10449a7a
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/10449a7a
Branch: refs/heads/hive-14535
Commit: 10449a7af25ee0db52284010f9168c46cd398bd4
Parents: 2f6f6bd
Author: Jesus Camacho Rodriguez <jc...@apache.org>
Authored: Thu Feb 23 12:05:12 2017 +0000
Committer: Jesus Camacho Rodriguez <jc...@apache.org>
Committed: Sat Feb 25 10:24:49 2017 +0000
----------------------------------------------------------------------
.../stats/annotation/StatsRulesProcFactory.java | 46 +++++++++++++-------
.../clientpositive/llap/explainuser_2.q.out | 28 ++++++------
.../clientpositive/llap/vectorization_0.q.out | 16 +++----
.../clientpositive/remove_exprs_stats.q.out | 18 ++++----
4 files changed, 61 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index bdb09a8..d9f70a7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -320,8 +320,13 @@ public class StatsRulesProcFactory {
long newNumRows = 0;
Statistics andStats = null;
- if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0)
+ if (stats.getNumRows() <= 1 || stats.getDataSize() <= 0) {
+ if (isDebugEnabled) {
+ LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() +
+ " Original data size: " + stats.getDataSize() + " New num rows: 1");
+ }
return 1;
+ }
if (pred instanceof ExprNodeGenericFuncDesc) {
ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred;
@@ -378,23 +383,31 @@ public class StatsRulesProcFactory {
if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
ColStatistics cs = stats.getColumnStatisticsFromColName(colName);
if (cs != null) {
- return cs.getNumTrues();
+ newNumRows = cs.getNumTrues();
+ } else {
+ // default
+ newNumRows = stats.getNumRows() / 2;
}
+ } else {
+ // if not boolean column return half the number of rows
+ newNumRows = stats.getNumRows() / 2;
}
-
- // if not boolean column return half the number of rows
- return stats.getNumRows() / 2;
} else if (pred instanceof ExprNodeConstantDesc) {
// special case for handling false constants
ExprNodeConstantDesc encd = (ExprNodeConstantDesc) pred;
if (Boolean.FALSE.equals(encd.getValue())) {
- return 0;
+ newNumRows = 0;
} else {
- return stats.getNumRows();
+ newNumRows = stats.getNumRows();
}
}
+ if (isDebugEnabled) {
+ LOG.debug("Estimating row count for " + pred + " Original num rows: " + stats.getNumRows() +
+ " New num rows: " + newNumRows);
+ }
+
return newNumRows;
}
@@ -476,15 +489,16 @@ public class StatsRulesProcFactory {
}
// 3. Calculate IN selectivity
- float factor = 1;
+ double factor = 1d;
for (int i = 0; i < columnStats.size(); i++) {
long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint();
- // ( num of distinct vals for col / num of rows ) * num of distinct vals for col in IN clause
- float columnFactor = dvs == 0 ? 0.5f : ((float)dvs / numRows) * values.get(i).size();
- factor *= columnFactor;
+ // (num of distinct vals for col in IN clause / num of distinct vals for col )
+ double columnFactor = dvs == 0 ? 0.5d : ((double) values.get(i).size() / dvs);
+ // max can be 1, even when ndv is larger in IN clause than in column stats
+ factor *= columnFactor > 1d ? 1d : columnFactor;
}
float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR);
- return Math.round( (double)numRows * factor * inFactor);
+ return Math.round( (double) numRows * factor * inFactor);
}
private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx,
@@ -1828,11 +1842,11 @@ public class StatsRulesProcFactory {
Map<Integer, Long> rowCountParents) {
if (newNumRows < 0) {
- LOG.info("STATS-" + jop.toString() + ": Overflow in number of rows."
+ LOG.debug("STATS-" + jop.toString() + ": Overflow in number of rows. "
+ newNumRows + " rows will be set to Long.MAX_VALUE");
}
if (newNumRows == 0) {
- LOG.info("STATS-" + jop.toString() + ": Equals 0 in number of rows."
+ LOG.debug("STATS-" + jop.toString() + ": Equals 0 in number of rows. "
+ newNumRows + " rows will be set to 1");
newNumRows = 1;
}
@@ -2252,12 +2266,12 @@ public class StatsRulesProcFactory {
boolean updateNDV) {
if (newNumRows < 0) {
- LOG.info("STATS-" + op.toString() + ": Overflow in number of rows."
+ LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. "
+ newNumRows + " rows will be set to Long.MAX_VALUE");
newNumRows = StatsUtils.getMaxIfOverflow(newNumRows);
}
if (newNumRows == 0) {
- LOG.info("STATS-" + op.toString() + ": Equals 0 in number of rows."
+ LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. "
+ newNumRows + " rows will be set to 1");
newNumRows = 1;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
index 67f960a..e898111 100644
--- a/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
+++ b/ql/src/test/results/clientpositive/llap/explainuser_2.q.out
@@ -298,29 +298,29 @@ Stage-0
Stage-1
Reducer 5 llap
File Output Operator [FS_55]
- Limit [LIM_54] (rows=24 width=285)
+ Limit [LIM_54] (rows=14 width=285)
Number of rows:100
- Select Operator [SEL_53] (rows=24 width=285)
+ Select Operator [SEL_53] (rows=14 width=285)
Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
<-Reducer 4 [SIMPLE_EDGE] llap
SHUFFLE [RS_52]
- Group By Operator [GBY_50] (rows=24 width=285)
+ Group By Operator [GBY_50] (rows=14 width=285)
Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"],keys:KEY._col0, KEY._col1, KEY._col2
<-Reducer 3 [SIMPLE_EDGE] llap
SHUFFLE [RS_49]
PartitionCols:_col0, _col1, _col2
- Group By Operator [GBY_48] (rows=24 width=285)
+ Group By Operator [GBY_48] (rows=14 width=285)
Output:["_col0","_col1","_col2","_col3","_col4","_col5"],aggregations:["count(_col13)","count(_col21)","count(_col3)"],keys:_col12, _col20, _col2
- Select Operator [SEL_47] (rows=650 width=534)
+ Select Operator [SEL_47] (rows=363 width=534)
Output:["_col12","_col20","_col2","_col13","_col21","_col3"]
- Merge Join Operator [MERGEJOIN_97] (rows=650 width=534)
+ Merge Join Operator [MERGEJOIN_97] (rows=363 width=534)
Conds:RS_44._col1, _col3=RS_45._col15, _col17(Inner),Output:["_col2","_col3","_col12","_col13","_col20","_col21"]
<-Reducer 11 [SIMPLE_EDGE] llap
SHUFFLE [RS_45]
PartitionCols:_col15, _col17
- Select Operator [SEL_40] (rows=190 width=447)
+ Select Operator [SEL_40] (rows=180 width=447)
Output:["_col14","_col15","_col17","_col6","_col7"]
- Merge Join Operator [MERGEJOIN_96] (rows=190 width=447)
+ Merge Join Operator [MERGEJOIN_96] (rows=180 width=447)
Conds:RS_37._col6, _col4=RS_38._col4, _col2(Inner),Output:["_col2","_col3","_col14","_col15","_col17"]
<-Reducer 10 [SIMPLE_EDGE] llap
SHUFFLE [RS_37]
@@ -376,7 +376,7 @@ Stage-0
<-Reducer 16 [SIMPLE_EDGE] llap
SHUFFLE [RS_38]
PartitionCols:_col4, _col2
- Merge Join Operator [MERGEJOIN_95] (rows=19 width=356)
+ Merge Join Operator [MERGEJOIN_95] (rows=18 width=356)
Conds:RS_24._col0=RS_25._col0(Inner),Output:["_col2","_col3","_col4","_col5"]
<-Map 15 [SIMPLE_EDGE] llap
SHUFFLE [RS_24]
@@ -390,16 +390,16 @@ Stage-0
<-Map 17 [SIMPLE_EDGE] llap
SHUFFLE [RS_25]
PartitionCols:_col0
- Select Operator [SEL_23] (rows=500 width=178)
+ Select Operator [SEL_23] (rows=7 width=178)
Output:["_col0"]
- Filter Operator [FIL_90] (rows=500 width=178)
+ Filter Operator [FIL_90] (rows=7 width=178)
predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null)
TableScan [TS_21] (rows=500 width=178)
default@src,d2,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
<-Reducer 2 [SIMPLE_EDGE] llap
SHUFFLE [RS_44]
PartitionCols:_col1, _col3
- Merge Join Operator [MERGEJOIN_91] (rows=414 width=269)
+ Merge Join Operator [MERGEJOIN_91] (rows=99 width=269)
Conds:RS_41._col0=RS_42._col0(Inner),Output:["_col1","_col2","_col3"]
<-Map 1 [SIMPLE_EDGE] llap
SHUFFLE [RS_41]
@@ -413,9 +413,9 @@ Stage-0
<-Map 6 [SIMPLE_EDGE] llap
SHUFFLE [RS_42]
PartitionCols:_col0
- Select Operator [SEL_5] (rows=500 width=178)
+ Select Operator [SEL_5] (rows=7 width=178)
Output:["_col0"]
- Filter Operator [FIL_84] (rows=500 width=178)
+ Filter Operator [FIL_84] (rows=7 width=178)
predicate:((value) IN ('2000Q1', '2000Q2', '2000Q3') and key is not null)
TableScan [TS_3] (rows=500 width=178)
default@src,d3,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
index 67fcdaa..af0bad9 100644
--- a/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
+++ b/ql/src/test/results/clientpositive/llap/vectorization_0.q.out
@@ -30777,19 +30777,19 @@ STAGE PLANS:
Filter Operator
isSamplingPred: false
predicate: (cstring1) IN ('biology', 'history', 'topology') (type: boolean)
- Statistics: Num rows: 12288 Data size: 862450 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 470 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
aggregations: count()
keys: cstring1 (type: string)
mode: hash
outputColumnNames: _col0, _col1
- Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: a
sort order: +
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
tag: -1
value expressions: _col1 (type: bigint)
auto parallelism: true
@@ -30855,16 +30855,16 @@ STAGE PLANS:
keys: KEY._col0 (type: string)
mode: mergepartial
outputColumnNames: _col0, _col1
- Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col1 (type: bigint), _col0 (type: string)
outputColumnNames: _col0, _col1
- Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col1 (type: string)
null sort order: a
sort order: +
- Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
tag: -1
value expressions: _col0 (type: bigint)
auto parallelism: false
@@ -30875,13 +30875,13 @@ STAGE PLANS:
Select Operator
expressions: VALUE._col0 (type: bigint), KEY.reducesinkkey0 (type: string)
outputColumnNames: _col0, _col1
- Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
GlobalTableId: 0
#### A masked pattern was here ####
NumFilesPerFileSink: 1
- Statistics: Num rows: 6144 Data size: 480424 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
http://git-wip-us.apache.org/repos/asf/hive/blob/10449a7a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
index 8fe688d..567e6b2 100644
--- a/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
+++ b/ql/src/test/results/clientpositive/remove_exprs_stats.q.out
@@ -449,14 +449,14 @@ STAGE PLANS:
Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (locid) IN (5) (type: boolean)
- Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -485,14 +485,14 @@ STAGE PLANS:
Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (locid) IN (5, 2, 3) (type: boolean)
- Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 5 Data size: 510 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -521,14 +521,14 @@ STAGE PLANS:
Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (locid) IN (1, 6) (type: boolean)
- Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int)
outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 3 Data size: 306 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat