You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2016/07/18 18:12:33 UTC
hive git commit: HIVE-14265: Partial stats in Join operator may lead to data size estimate of 0 (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master 3e3f01c3a -> 879026e93
HIVE-14265: Partial stats in Join operator may lead to data size estimate of 0 (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/879026e9
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/879026e9
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/879026e9
Branch: refs/heads/master
Commit: 879026e93a0915dff2de5a8360eea7e06836b2a4
Parents: 3e3f01c
Author: Jesus Camacho Rodriguez <jc...@apache.org>
Authored: Mon Jul 18 10:42:37 2016 +0100
Committer: Jesus Camacho Rodriguez <jc...@apache.org>
Committed: Mon Jul 18 18:57:00 2016 +0100
----------------------------------------------------------------------
.../stats/annotation/StatsRulesProcFactory.java | 20 +++-
.../queries/clientpositive/stats_partial_size.q | 8 ++
.../clientpositive/stats_partial_size.q.out | 100 +++++++++++++++++++
3 files changed, 125 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/879026e9/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 2d0417a..42cbc14 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -19,8 +19,10 @@
package org.apache.hadoop.hive.ql.optimizer.stats.annotation;
import java.lang.reflect.Field;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -34,7 +36,6 @@ import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
-import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
@@ -1472,7 +1473,7 @@ public class StatsRulesProcFactory {
// update join statistics
stats.setColumnStats(outColStats);
long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom, jop);
- updateColStats(stats, newRowCount, jop, rowCountParents);
+ updateColStats(conf, stats, newRowCount, jop, rowCountParents);
jop.setStatistics(stats);
if (isDebugEnabled) {
@@ -1766,7 +1767,7 @@ public class StatsRulesProcFactory {
return result;
}
- private void updateColStats(Statistics stats, long newNumRows,
+ private void updateColStats(HiveConf conf, Statistics stats, long newNumRows,
CommonJoinOperator<? extends JoinDesc> jop,
Map<Integer, Long> rowCountParents) {
@@ -1789,7 +1790,9 @@ public class StatsRulesProcFactory {
// stats for columns from 1st parent should be scaled down by 200/10 = 20x
// and stats for columns from 2nd parent should be scaled down by 200x
List<ColStatistics> colStats = stats.getColumnStats();
+ Set<String> colNameStatsAvailable = new HashSet<>();
for (ColStatistics cs : colStats) {
+ colNameStatsAvailable.add(cs.getColumnName());
int pos = jop.getConf().getReversedExprs().get(cs.getColumnName());
long oldRowCount = rowCountParents.get(pos);
double ratio = (double) newNumRows / (double) oldRowCount;
@@ -1811,6 +1814,17 @@ public class StatsRulesProcFactory {
stats.setColumnStats(colStats);
long newDataSize = StatsUtils
.getDataSizeFromColumnStats(newNumRows, colStats);
+ // Add default size for columns for which stats were not available
+ List<String> neededColumns = new ArrayList<>();
+ for (String colName : jop.getSchema().getColumnNames()) {
+ if (!colNameStatsAvailable.contains(colName)) {
+ neededColumns.add(colName);
+ }
+ }
+ if (neededColumns.size() != 0) {
+ int restColumnsDefaultSize = StatsUtils.estimateRowSizeFromSchema(conf, jop.getSchema().getSignature(), neededColumns);
+ newDataSize = StatsUtils.safeAdd(newDataSize, StatsUtils.safeMult(restColumnsDefaultSize, newNumRows));
+ }
stats.setDataSize(StatsUtils.getMaxIfOverflow(newDataSize));
}
http://git-wip-us.apache.org/repos/asf/hive/blob/879026e9/ql/src/test/queries/clientpositive/stats_partial_size.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/stats_partial_size.q b/ql/src/test/queries/clientpositive/stats_partial_size.q
new file mode 100644
index 0000000..c42d351
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/stats_partial_size.q
@@ -0,0 +1,8 @@
+set hive.stats.fetch.column.stats=true;
+
+create table sample_partitioned (x int) partitioned by (y int);
+insert into sample_partitioned partition(y=1) values (1),(2);
+create temporary table sample as select * from sample_partitioned;
+analyze table sample compute statistics for columns;
+
+explain select sample_partitioned.x from sample_partitioned, sample where sample.y = sample_partitioned.y;
http://git-wip-us.apache.org/repos/asf/hive/blob/879026e9/ql/src/test/results/clientpositive/stats_partial_size.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/stats_partial_size.q.out b/ql/src/test/results/clientpositive/stats_partial_size.q.out
new file mode 100644
index 0000000..31adec7
--- /dev/null
+++ b/ql/src/test/results/clientpositive/stats_partial_size.q.out
@@ -0,0 +1,100 @@
+PREHOOK: query: create table sample_partitioned (x int) partitioned by (y int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sample_partitioned
+POSTHOOK: query: create table sample_partitioned (x int) partitioned by (y int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sample_partitioned
+PREHOOK: query: insert into sample_partitioned partition(y=1) values (1),(2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@sample_partitioned@y=1
+POSTHOOK: query: insert into sample_partitioned partition(y=1) values (1),(2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@sample_partitioned@y=1
+POSTHOOK: Lineage: sample_partitioned PARTITION(y=1).x EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: create temporary table sample as select * from sample_partitioned
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sample_partitioned
+PREHOOK: Input: default@sample_partitioned@y=1
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sample
+POSTHOOK: query: create temporary table sample as select * from sample_partitioned
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sample_partitioned
+POSTHOOK: Input: default@sample_partitioned@y=1
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sample
+PREHOOK: query: analyze table sample compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sample
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table sample compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@sample
+#### A masked pattern was here ####
+PREHOOK: query: explain select sample_partitioned.x from sample_partitioned, sample where sample.y = sample_partitioned.y
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select sample_partitioned.x from sample_partitioned, sample where sample.y = sample_partitioned.y
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: sample_partitioned
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: PARTIAL
+ Select Operator
+ expressions: x (type: int), y (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: PARTIAL
+ value expressions: _col0 (type: int)
+ TableScan
+ alias: sample
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: y is not null (type: boolean)
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: y (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 _col0 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: PARTIAL
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: PARTIAL
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+