You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2018/07/30 22:20:28 UTC
hive git commit: HIVE-20262 : Implement stats annotation rule for the
UDTFOperator (George Pachitariu via Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master 44f955ed7 -> 6fa9f6339
HIVE-20262 : Implement stats annotation rule for the UDTFOperator (George Pachitariu via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/6fa9f633
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/6fa9f633
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/6fa9f633
Branch: refs/heads/master
Commit: 6fa9f63394c097547ddca194981779aa9c061317
Parents: 44f955e
Author: George Pachitariu <ge...@gmail.com>
Authored: Mon Jul 30 15:19:35 2018 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Mon Jul 30 15:19:35 2018 -0700
----------------------------------------------------------------------
.../org/apache/hadoop/hive/conf/HiveConf.java | 5 +
.../annotation/AnnotateWithStatistics.java | 3 +
.../stats/annotation/StatsRulesProcFactory.java | 38 +++
.../clientpositive/annotate_stats_udtf.q | 32 +++
.../clientpositive/annotate_stats_udtf.q.out | 255 +++++++++++++++++++
5 files changed, 333 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 39c77b3..cce908f 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2398,6 +2398,11 @@ public class HiveConf extends Configuration {
"filter operators."),
HIVE_STATS_IN_MIN_RATIO("hive.stats.filter.in.min.ratio", (float) 0.05,
"Output estimation of an IN filter can't be lower than this ratio"),
+ HIVE_STATS_UDTF_FACTOR("hive.stats.udtf.factor", (float) 1.0,
+ "UDTFs change the number of rows of the output. A common UDTF is the explode() method that creates\n" +
+ "multiple rows for each element in the input array. This factor is applied to the number of\n" +
+ "output rows and output size."),
+
// Concurrency
HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false,
"Whether Hive supports concurrency control or not. \n" +
http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
index 4b3b2ac..cfcb355 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/AnnotateWithStatistics.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.UDTFOperator;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
@@ -65,6 +66,8 @@ public class AnnotateWithStatistics extends Transform {
StatsRulesProcFactory.getLimitRule());
opRules.put(new RuleRegExp("RS", ReduceSinkOperator.getOperatorName() + "%"),
StatsRulesProcFactory.getReduceSinkRule());
+ opRules.put(new RuleRegExp("UDTF", UDTFOperator.getOperatorName() + "%"),
+ StatsRulesProcFactory.getUDTFRule());
// The dispatcher fires the processor corresponding to the closest matching
// rule and passes the context along
http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 3c2b085..997e289 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -47,6 +47,7 @@ import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.UDTFOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
@@ -2499,6 +2500,39 @@ public class StatsRulesProcFactory {
}
/**
+ * UDTF operator changes the number of rows and thereby the data size.
+ */
+ public static class UDTFStatsRule extends DefaultStatsRule implements NodeProcessor {
+   /**
+    * Derives the statistics of a UDTF operator from those of its first parent.
+    * The parent's statistics are cloned, and the clone's row count and data size
+    * are replaced by the parent's values multiplied by the configured
+    * HIVE_STATS_UDTF_FACTOR. If the parent has no statistics, the operator's
+    * statistics are not set.
+    *
+    * @param nd the node being visited; cast to UDTFOperator
+    * @param stack the walker's node stack (not used here)
+    * @param procCtx the processing context; cast to AnnotateStatsProcCtx
+    * @param nodeOutputs outputs of previously processed nodes (not used here)
+    * @return always null
+    */
+   @Override
+   public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+       Object... nodeOutputs) throws SemanticException {
+     AnnotateStatsProcCtx statsCtx = (AnnotateStatsProcCtx) procCtx;
+     UDTFOperator udtfOp = (UDTFOperator) nd;
+
+     // Only the first parent is consulted.
+     Statistics inputStats = udtfOp.getParentOperators().get(0).getStatistics();
+     if (inputStats == null) {
+       // Nothing to scale from; leave the operator's statistics untouched.
+       return null;
+     }
+
+     Statistics scaledStats = inputStats.clone();
+     float factor = HiveConf.getFloatVar(statsCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_UDTF_FACTOR);
+
+     // The row count is truncated by the cast; the data size goes through StatsUtils.safeMult.
+     long scaledRows = (long) (inputStats.getNumRows() * factor);
+     long scaledSize = StatsUtils.safeMult(inputStats.getDataSize(), factor);
+     scaledStats.setNumRows(scaledRows);
+     scaledStats.setDataSize(scaledSize);
+
+     if (LOG.isDebugEnabled()) {
+       LOG.debug("[0] STATS-" + udtfOp.toString() + ": " + scaledStats.extendedToString());
+     }
+
+     udtfOp.setStatistics(scaledStats);
+     return null;
+   }
+ }
+
+ /**
* Default rule is to aggregate the statistics from all its parent operators.
*/
public static class DefaultStatsRule implements NodeProcessor {
@@ -2584,6 +2618,10 @@ public class StatsRulesProcFactory {
return new ReduceSinkStatsRule();
}
+ /**
+  * Creates the statistics rule used for UDTF operators.
+  *
+  * @return a new UDTFStatsRule instance
+  */
+ public static NodeProcessor getUDTFRule() {
+   final NodeProcessor udtfRule = new UDTFStatsRule();
+   return udtfRule;
+ }
+
public static NodeProcessor getDefaultRule() {
return new DefaultStatsRule();
}
http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/test/queries/clientpositive/annotate_stats_udtf.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/annotate_stats_udtf.q b/ql/src/test/queries/clientpositive/annotate_stats_udtf.q
new file mode 100644
index 0000000..74e6ebf
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/annotate_stats_udtf.q
@@ -0,0 +1,32 @@
+-- Set up a table with two rows, each holding a single-element array
+drop table if exists HIVE_20262;
+create table HIVE_20262 (a array<int>);
+insert into HIVE_20262 select array(1);
+insert into HIVE_20262 select array(2);
+
+-- Scale the UDTF operator's estimated row count and data size by 5
+set hive.stats.udtf.factor=5;
+
+-- Test when input has a single row
+explain select explode(array(1,2,3,4,5)) as col;
+
+-- Test when input has multiple rows
+explain select explode(a) from HIVE_20262;
+
+-- With a lateral view, the estimated output data size should increase as well
+explain select 1, r from HIVE_20262
+ lateral view explode(a) t as r ;
+
+
+-- Default behaviour tests:
+
+-- 1 is the default value, so the UDTF estimates should equal those of its input
+set hive.stats.udtf.factor=1;
+
+-- Test when input has a single row
+explain select explode(array(1,2,3,4,5)) as col;
+
+-- Test when input has multiple rows
+explain select explode(a) from HIVE_20262;
+
+
http://git-wip-us.apache.org/repos/asf/hive/blob/6fa9f633/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out b/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out
new file mode 100644
index 0000000..f526487
--- /dev/null
+++ b/ql/src/test/results/clientpositive/annotate_stats_udtf.q.out
@@ -0,0 +1,255 @@
+PREHOOK: query: drop table if exists HIVE_20262
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists HIVE_20262
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table HIVE_20262 (a array<int>)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@HIVE_20262
+POSTHOOK: query: create table HIVE_20262 (a array<int>)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@HIVE_20262
+PREHOOK: query: insert into HIVE_20262 select array(1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@hive_20262
+POSTHOOK: query: insert into HIVE_20262 select array(1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@hive_20262
+POSTHOOK: Lineage: hive_20262.a EXPRESSION []
+PREHOOK: query: insert into HIVE_20262 select array(2)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@hive_20262
+POSTHOOK: query: insert into HIVE_20262 select array(2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@hive_20262
+POSTHOOK: Lineage: hive_20262.a EXPRESSION []
+PREHOOK: query: explain select explode(array(1,2,3,4,5)) as col
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select explode(array(1,2,3,4,5)) as col
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: _dummy_table
+ Row Limit Per Split: 1
+ Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: array(1,2,3,4,5) (type: array<int>)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+ UDTF Operator
+ Statistics: Num rows: 5 Data size: 320 Basic stats: COMPLETE Column stats: COMPLETE
+ function name: explode
+ Select Operator
+ expressions: col (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select explode(a) from HIVE_20262
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select explode(a) from HIVE_20262
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: hive_20262
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: a (type: array<int>)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ UDTF Operator
+ Statistics: Num rows: 10 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+ function name: explode
+ Select Operator
+ expressions: col (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 10 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select 1, r from HIVE_20262
+ lateral view explode(a) t as r
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select 1, r from HIVE_20262
+ lateral view explode(a) t as r
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: hive_20262
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Lateral View Forward
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Lateral View Join Operator
+ outputColumnNames: _col4
+ Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: 1 (type: int), _col4 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Select Operator
+ expressions: a (type: array<int>)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ UDTF Operator
+ Statistics: Num rows: 10 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+ function name: explode
+ Lateral View Join Operator
+ outputColumnNames: _col4
+ Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: 1 (type: int), _col4 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 12 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select explode(array(1,2,3,4,5)) as col
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select explode(array(1,2,3,4,5)) as col
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: _dummy_table
+ Row Limit Per Split: 1
+ Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: array(1,2,3,4,5) (type: array<int>)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+ UDTF Operator
+ Statistics: Num rows: 1 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+ function name: explode
+ Select Operator
+ expressions: col (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain select explode(a) from HIVE_20262
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select explode(a) from HIVE_20262
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: hive_20262
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: a (type: array<int>)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ UDTF Operator
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ function name: explode
+ Select Operator
+ expressions: col (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 2 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+