You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jd...@apache.org on 2016/04/04 22:36:57 UTC
[22/50] [abbrv] hive git commit: HIVE-13358: Stats state is not
captured correctly: turn off stats optimizer for sampled table (Pengcheng
Xiong, reviewed by Ashutosh Chauhan)
HIVE-13358: Stats state is not captured correctly: turn off stats optimizer for sampled table (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/77474581
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/77474581
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/77474581
Branch: refs/heads/llap
Commit: 77474581df4016e3899a986e079513087a945674
Parents: 41a30b5
Author: Pengcheng Xiong <px...@apache.org>
Authored: Sun Mar 27 22:30:29 2016 -0700
Committer: Pengcheng Xiong <px...@apache.org>
Committed: Sun Mar 27 22:30:42 2016 -0700
----------------------------------------------------------------------
.../hive/ql/optimizer/StatsOptimizer.java | 14 +-
.../sample_islocalmode_hook_use_metadata.q | 48 ++++
.../sample_islocalmode_hook_use_metadata.q.out | 230 +++++++++++++++++++
3 files changed, 289 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index 4091c0d..bc17fec 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -100,9 +100,12 @@ public class StatsOptimizer extends Transform {
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
- if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery() ||
- pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS() ||
- pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()) {
+ if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery()
+ || pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS()
+ || pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()
+ // If getNameToSplitSample is not empty, at least one of the source
+ // tables is being sampled and we cannot optimize.
+ || !pctx.getNameToSplitSample().isEmpty()) {
return pctx;
}
@@ -251,6 +254,11 @@ public class StatsOptimizer extends Transform {
// looks like a subq plan.
return null;
}
+ if (tsOp.getConf().getRowLimit() != -1) {
+ // table is sampled. In some situations we could leverage the row
+ // limit, but to be safe we do not use it for now.
+ return null;
+ }
SelectOperator pselOp = (SelectOperator)stack.get(1);
for(ExprNodeDesc desc : pselOp.getConf().getColList()) {
if (!((desc instanceof ExprNodeColumnDesc) || (desc instanceof ExprNodeConstantDesc))) {
http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q b/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
new file mode 100644
index 0000000..ac915b5
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
@@ -0,0 +1,48 @@
+set hive.mapred.mode=nonstrict;
+set hive.exec.submitviachild=true;
+set hive.exec.submit.local.task.via.child=true;
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+set mapred.max.split.size=300;
+set mapred.min.split.size=300;
+set mapred.min.split.size.per.node=300;
+set mapred.min.split.size.per.rack=300;
+set hive.exec.mode.local.auto=true;
+set hive.merge.smallfiles.avgsize=1;
+set hive.compute.query.using.stats=true;
+
+-- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string);
+insert overwrite table sih_i_part partition (p='1') select key, value from src;
+insert overwrite table sih_i_part partition (p='2') select key+10000, value from src;
+insert overwrite table sih_i_part partition (p='3') select key+20000, value from src;
+create table sih_src as select key, value from sih_i_part order by key, value;
+create table sih_src2 as select key, value from sih_src order by key, value;
+
+set hive.exec.post.hooks = org.apache.hadoop.hive.ql.hooks.VerifyIsLocalModeHook;
+set mapreduce.framework.name=yarn;
+set mapreduce.jobtracker.address=localhost:58;
+set hive.sample.seednumber=7;
+
+-- Relaxing hive.exec.mode.local.auto.input.files.max=1.
+-- Hadoop20 will not generate more splits than there are files (one).
+-- Hadoop23 generates splits correctly (four), hence the max needs to be adjusted to ensure running in local mode.
+-- Default value is hive.exec.mode.local.auto.input.files.max=4 which produces expected behavior on Hadoop23.
+-- hive.sample.seednumber is required because Hadoop23 generates multiple splits and tablesample is non-repeatable without it.
+
+-- sample split, running locally limited by num tasks
+
+desc formatted sih_src;
+
+explain select count(1) from sih_src;
+
+select count(1) from sih_src;
+
+explain select count(1) from sih_src tablesample(1 percent);
+
+select count(1) from sih_src tablesample(1 percent);
+
+explain select count(1) from sih_src tablesample(10 rows);
+
+select count(1) from sih_src tablesample(10 rows);
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out b/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
new file mode 100644
index 0000000..d268837
--- /dev/null
+++ b/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
@@ -0,0 +1,230 @@
+PREHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_i_part
+POSTHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_i_part
+PREHOOK: query: insert overwrite table sih_i_part partition (p='1') select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=1
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='1') select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=1
+POSTHOOK: Lineage: sih_i_part PARTITION(p=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table sih_i_part partition (p='2') select key+10000, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=2
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='2') select key+10000, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=2
+POSTHOOK: Lineage: sih_i_part PARTITION(p=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table sih_i_part partition (p='3') select key+20000, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=3
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='3') select key+20000, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=3
+POSTHOOK: Lineage: sih_i_part PARTITION(p=3).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=3).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: create table sih_src as select key, value from sih_i_part order by key, value
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sih_i_part
+PREHOOK: Input: default@sih_i_part@p=1
+PREHOOK: Input: default@sih_i_part@p=2
+PREHOOK: Input: default@sih_i_part@p=3
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_src
+POSTHOOK: query: create table sih_src as select key, value from sih_i_part order by key, value
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sih_i_part
+POSTHOOK: Input: default@sih_i_part@p=1
+POSTHOOK: Input: default@sih_i_part@p=2
+POSTHOOK: Input: default@sih_i_part@p=3
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_src
+POSTHOOK: Lineage: sih_src.key SIMPLE [(sih_i_part)sih_i_part.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: sih_src.value SIMPLE [(sih_i_part)sih_i_part.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: create table sih_src2 as select key, value from sih_src order by key, value
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sih_src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_src2
+POSTHOOK: query: create table sih_src2 as select key, value from sih_src order by key, value
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sih_src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_src2
+POSTHOOK: Lineage: sih_src2.key SIMPLE [(sih_src)sih_src.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: sih_src2.value SIMPLE [(sih_src)sih_src.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: -- Relaxing hive.exec.mode.local.auto.input.files.max=1.
+-- Hadoop20 will not generate more splits than there are files (one).
+-- Hadoop23 generates splits correctly (four), hence the max needs to be adjusted to ensure running in local mode.
+-- Default value is hive.exec.mode.local.auto.input.files.max=4 which produces expected behavior on Hadoop23.
+-- hive.sample.seednumber is required because Hadoop23 generates multiple splits and tablesample is non-repeatable without it.
+
+-- sample split, running locally limited by num tasks
+
+desc formatted sih_src
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@sih_src
+# col_name data_type comment
+
+key int
+value string
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
+ numFiles 1
+ numRows 1500
+ rawDataSize 18124
+ totalSize 19624
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat: org.apache.hadoop.mapred.TextInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: explain select count(1) from sih_src
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: 1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(1) from sih_src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+1500
+PREHOOK: query: explain select count(1) from sih_src tablesample(1 percent)
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: sih_src
+ Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(1)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(1) from sih_src tablesample(1 percent)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+25
+PREHOOK: query: explain select count(1) from sih_src tablesample(10 rows)
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: sih_src
+ Row Limit Per Split: 10
+ Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(1)
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select count(1) from sih_src tablesample(10 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+650