You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by br...@apache.org on 2014/08/26 03:07:26 UTC
svn commit: r1620492 - in /hive/branches/spark:
common/src/java/org/apache/hadoop/hive/conf/ itests/src/test/resources/
ql/src/java/org/apache/hadoop/hive/ql/optimizer/
ql/src/test/results/clientpositive/spark/
Author: brock
Date: Tue Aug 26 01:07:25 2014
New Revision: 1620492
URL: http://svn.apache.org/r1620492
Log:
HIVE-7810 - Insert overwrite table query has strange behavior when set hive.optimize.union.remove=true [Spark Branch] (Na Yang via Brock)
Added:
hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_10.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_9.q.out
Modified:
hive/branches/spark/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
hive/branches/spark/itests/src/test/resources/testconfiguration.properties
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_16.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_4.q.out
hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_5.q.out
Modified: hive/branches/spark/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1620492&r1=1620491&r2=1620492&view=diff
==============================================================================
--- hive/branches/spark/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original)
+++ hive/branches/spark/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Tue Aug 26 01:07:25 2014
@@ -793,6 +793,7 @@ public class HiveConf extends Configurat
HIVEMERGEMAPREDFILES("hive.merge.mapredfiles", false,
"Merge small files at the end of a map-reduce job"),
HIVEMERGETEZFILES("hive.merge.tezfiles", false, "Merge small files at the end of a Tez DAG"),
+ HIVEMERGESPARKFILES("hive.merge.sparkfiles", false, "Merge small files at the end of a Spark DAG Transformation"),
HIVEMERGEMAPFILESSIZE("hive.merge.size.per.task", (long) (256 * 1000 * 1000),
"Size of merged files at the end of the job"),
HIVEMERGEMAPFILESAVGSIZE("hive.merge.smallfiles.avgsize", (long) (16 * 1000 * 1000),
Modified: hive/branches/spark/itests/src/test/resources/testconfiguration.properties
URL: http://svn.apache.org/viewvc/hive/branches/spark/itests/src/test/resources/testconfiguration.properties?rev=1620492&r1=1620491&r2=1620492&view=diff
==============================================================================
--- hive/branches/spark/itests/src/test/resources/testconfiguration.properties (original)
+++ hive/branches/spark/itests/src/test/resources/testconfiguration.properties Tue Aug 26 01:07:25 2014
@@ -424,6 +424,7 @@ spark.query.files=alter_merge_orc.q \
union_null.q \
union_ppr.q \
union_remove_1.q \
+ union_remove_10.q \
union_remove_11.q \
union_remove_15.q \
union_remove_16.q \
@@ -440,4 +441,5 @@ spark.query.files=alter_merge_orc.q \
union_remove_5.q \
union_remove_6.q \
union_remove_7.q \
- union_remove_8.q
+ union_remove_8.q \
+ union_remove_9.q
Modified: hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java?rev=1620492&r1=1620491&r2=1620492&view=diff
==============================================================================
--- hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java (original)
+++ hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java Tue Aug 26 01:07:25 2014
@@ -1671,6 +1671,9 @@ public final class GenMapRedUtils {
// tez blurs the boundary between map and reduce, thus it has it's own
// config
return hconf.getBoolVar(ConfVars.HIVEMERGETEZFILES);
+ } else if (currTask.getWork() instanceof SparkWork) {
+ // spark has its own config for merging
+ return hconf.getBoolVar(ConfVars.HIVEMERGESPARKFILES);
}
if (fsOp.getConf().isLinkedFileSink()) {
Added: hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_10.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_10.q.out?rev=1620492&view=auto
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_10.q.out (added)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_10.q.out Tue Aug 26 01:07:25 2014
@@ -0,0 +1,273 @@
+PREHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 2 subqueries is performed (one of which is a map-only query, and the
+-- other one contains a nested union where one of the sub-queries requires a map-reduce
+-- job), followed by select star and a file sink.
+-- There is no need to write the temporary results of the sub-queries, and then read them
+-- again to process the union. The outer union can be removed completely.
+-- The final file format is different from the input and intermediate file format.
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- on
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+-- Since this test creates sub-directories for the output table outputTbl1, it might be easier
+-- to run the test only on hadoop 23
+
+create table inputTbl1(key string, val string) stored as textfile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@inputTbl1
+POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 2 subqueries is performed (one of which is a map-only query, and the
+-- other one contains a nested union where one of the sub-queries requires a map-reduce
+-- job), followed by select star and a file sink.
+-- There is no need to write the temporary results of the sub-queries, and then read them
+-- again to process the union. The outer union can be removed completely.
+-- The final file format is different from the input and intermediate file format.
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- on
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+-- Since this test creates sub-directories for the output table outputTbl1, it might be easier
+-- to run the test only on hadoop 23
+
+create table inputTbl1(key string, val string) stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@inputTbl1
+PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@outputTbl1
+POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@outputTbl1
+PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@inputtbl1
+POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@inputtbl1
+PREHOOK: query: explain
+insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, 1 as values from inputTbl1
+union all
+select * FROM (
+ SELECT key, count(1) as values from inputTbl1 group by key
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, 1 as values from inputTbl1
+union all
+select * FROM (
+ SELECT key, count(1) as values from inputTbl1 group by key
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (GROUP)
+ Union 3 <- Map 5 (NONE), Reducer 2 (NONE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: inputtbl1
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Select Operator
+ expressions: key (type: string)
+ outputColumnNames: key
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Group By Operator
+ aggregations: count(1)
+ keys: key (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: inputtbl1
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Select Operator
+ expressions: key (type: string), UDFToLong(1) (type: bigint)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: inputtbl1
+ Select Operator
+ expressions: key (type: string), UDFToLong(2) (type: bigint)
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: bigint)
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+ Reducer 2
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: bigint)
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: bigint)
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+ Union 3
+ Vertex: Union 3
+
+ Stage: Stage-2
+ Dependency Collection
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+
+PREHOOK: query: insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, 1 as values from inputTbl1
+union all
+select * FROM (
+ SELECT key, count(1) as values from inputTbl1 group by key
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+PREHOOK: type: QUERY
+PREHOOK: Input: default@inputtbl1
+PREHOOK: Output: default@outputtbl1
+POSTHOOK: query: insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, 1 as values from inputTbl1
+union all
+select * FROM (
+ SELECT key, count(1) as values from inputTbl1 group by key
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@inputtbl1
+POSTHOOK: Output: default@outputtbl1
+POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ]
+PREHOOK: query: desc formatted outputTbl1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@outputtbl1
+POSTHOOK: query: desc formatted outputTbl1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@outputtbl1
+# col_name data_type comment
+
+key string
+values bigint
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Protect Mode: None
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE false
+ numFiles 3
+ numRows -1
+ rawDataSize -1
+ totalSize 271
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: select * from outputTbl1 order by key, values
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from outputTbl1 order by key, values
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+1 1
+1 1
+1 2
+2 1
+2 1
+2 2
+3 1
+3 1
+3 2
+7 1
+7 1
+7 2
+8 1
+8 1
+8 2
+8 2
+8 2
Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_16.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_16.q.out?rev=1620492&r1=1620491&r2=1620492&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_16.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_16.q.out Tue Aug 26 01:07:25 2014
@@ -66,13 +66,8 @@ FROM (
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
Stage-1 is a root stage
- Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5
- Stage-4
- Stage-2 depends on stages: Stage-4, Stage-3, Stage-6
+ Stage-2 depends on stages: Stage-1
Stage-0 depends on stages: Stage-2
- Stage-3
- Stage-5
- Stage-6 depends on stages: Stage-5
STAGE PLANS:
Stage: Stage-1
@@ -165,15 +160,6 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.outputtbl1
- Stage: Stage-7
- Conditional Operator
-
- Stage: Stage-4
- Move Operator
- files:
- hdfs directory: true
-#### A masked pattern was here ####
-
Stage: Stage-2
Dependency Collection
@@ -189,22 +175,6 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.outputtbl1
- Stage: Stage-3
- Merge Work
- merge level: block
- input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
-
- Stage: Stage-5
- Merge Work
- merge level: block
- input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
-
- Stage: Stage-6
- Move Operator
- files:
- hdfs directory: true
-#### A masked pattern was here ####
-
PREHOOK: query: insert overwrite table outputTbl1 partition (ds)
SELECT *
FROM (
Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_4.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_4.q.out?rev=1620492&r1=1620491&r2=1620492&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_4.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_4.q.out Tue Aug 26 01:07:25 2014
@@ -62,13 +62,8 @@ FROM (
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
Stage-1 is a root stage
- Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5
- Stage-4
- Stage-2 depends on stages: Stage-4, Stage-3, Stage-6
+ Stage-2 depends on stages: Stage-1
Stage-0 depends on stages: Stage-2
- Stage-3
- Stage-5
- Stage-6 depends on stages: Stage-5
STAGE PLANS:
Stage: Stage-1
@@ -161,15 +156,6 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.outputtbl1
- Stage: Stage-7
- Conditional Operator
-
- Stage: Stage-4
- Move Operator
- files:
- hdfs directory: true
-#### A masked pattern was here ####
-
Stage: Stage-2
Dependency Collection
@@ -183,42 +169,6 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.outputtbl1
- Stage: Stage-3
- Spark
-#### A masked pattern was here ####
- Vertices:
- Merge
- Map Operator Tree:
- TableScan
- File Output Operator
- compressed: false
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.outputtbl1
-
- Stage: Stage-5
- Spark
-#### A masked pattern was here ####
- Vertices:
- Merge
- Map Operator Tree:
- TableScan
- File Output Operator
- compressed: false
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.outputtbl1
-
- Stage: Stage-6
- Move Operator
- files:
- hdfs directory: true
-#### A masked pattern was here ####
-
PREHOOK: query: insert overwrite table outputTbl1
SELECT *
FROM (
Modified: hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_5.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_5.q.out?rev=1620492&r1=1620491&r2=1620492&view=diff
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_5.q.out (original)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_5.q.out Tue Aug 26 01:07:25 2014
@@ -70,13 +70,8 @@ FROM (
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
Stage-1 is a root stage
- Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5
- Stage-4
- Stage-2 depends on stages: Stage-4, Stage-3, Stage-6
+ Stage-2 depends on stages: Stage-1
Stage-0 depends on stages: Stage-2
- Stage-3
- Stage-5
- Stage-6 depends on stages: Stage-5
STAGE PLANS:
Stage: Stage-1
@@ -161,15 +156,6 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.outputtbl1
- Stage: Stage-7
- Conditional Operator
-
- Stage: Stage-4
- Move Operator
- files:
- hdfs directory: true
-#### A masked pattern was here ####
-
Stage: Stage-2
Dependency Collection
@@ -183,42 +169,6 @@ STAGE PLANS:
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.outputtbl1
- Stage: Stage-3
- Spark
-#### A masked pattern was here ####
- Vertices:
- Merge
- Map Operator Tree:
- TableScan
- File Output Operator
- compressed: false
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.outputtbl1
-
- Stage: Stage-5
- Spark
-#### A masked pattern was here ####
- Vertices:
- Merge
- Map Operator Tree:
- TableScan
- File Output Operator
- compressed: false
- table:
- input format: org.apache.hadoop.mapred.TextInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
- name: default.outputtbl1
-
- Stage: Stage-6
- Move Operator
- files:
- hdfs directory: true
-#### A masked pattern was here ####
-
PREHOOK: query: insert overwrite table outputTbl1
SELECT *
FROM (
Added: hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_9.q.out
URL: http://svn.apache.org/viewvc/hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_9.q.out?rev=1620492&view=auto
==============================================================================
--- hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_9.q.out (added)
+++ hive/branches/spark/ql/src/test/results/clientpositive/spark/union_remove_9.q.out Tue Aug 26 01:07:25 2014
@@ -0,0 +1,269 @@
+PREHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 2 subqueries is performed (one of which contains a union and is map-only),
+-- and the other one is a map-reduce query followed by select star and a file sink.
+-- There is no need for the outer union.
+-- The final file format is different from the input and intermediate file format.
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- on
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+-- Since this test creates sub-directories for the output table outputTbl1, it might be easier
+-- to run the test only on hadoop 23
+
+create table inputTbl1(key string, val string) stored as textfile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@inputTbl1
+POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 2 subqueries is performed (one of which contains a union and is map-only),
+-- and the other one is a map-reduce query followed by select star and a file sink.
+-- There is no need for the outer union.
+-- The final file format is different from the input and intermediate file format.
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- on
+
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
+-- Since this test creates sub-directories for the output table outputTbl1, it might be easier
+-- to run the test only on hadoop 23
+
+create table inputTbl1(key string, val string) stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@inputTbl1
+PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@outputTbl1
+POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@outputTbl1
+PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@inputtbl1
+POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@inputtbl1
+PREHOOK: query: explain
+insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, count(1) as values from inputTbl1 group by key
+union all
+select * FROM (
+ SELECT key, 1 as values from inputTbl1
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, count(1) as values from inputTbl1 group by key
+union all
+select * FROM (
+ SELECT key, 1 as values from inputTbl1
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 4 <- Map 3 (GROUP)
+ Union 2 <- Map 1 (NONE), Map 5 (NONE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: inputtbl1
+ Select Operator
+ expressions: key (type: string), 1 (type: int)
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint)
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: inputtbl1
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Select Operator
+ expressions: key (type: string)
+ outputColumnNames: key
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Group By Operator
+ aggregations: count(1)
+ keys: key (type: string)
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE
+ value expressions: _col1 (type: bigint)
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: inputtbl1
+ Select Operator
+ expressions: key (type: string), 2 (type: int)
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint)
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+ Reducer 4
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col1 (type: bigint)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+ Union 2
+ Vertex: Union 2
+
+ Stage: Stage-2
+ Dependency Collection
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+ name: default.outputtbl1
+
+PREHOOK: query: insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, count(1) as values from inputTbl1 group by key
+union all
+select * FROM (
+ SELECT key, 1 as values from inputTbl1
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+PREHOOK: type: QUERY
+PREHOOK: Input: default@inputtbl1
+PREHOOK: Output: default@outputtbl1
+POSTHOOK: query: insert overwrite table outputTbl1
+SELECT * FROM
+(
+select key, count(1) as values from inputTbl1 group by key
+union all
+select * FROM (
+ SELECT key, 1 as values from inputTbl1
+ UNION ALL
+ SELECT key, 2 as values from inputTbl1
+) a
+)b
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@inputtbl1
+POSTHOOK: Output: default@outputtbl1
+POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ]
+PREHOOK: query: desc formatted outputTbl1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@outputtbl1
+POSTHOOK: query: desc formatted outputTbl1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@outputtbl1
+# col_name data_type comment
+
+key string
+values bigint
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Protect Mode: None
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE false
+ numFiles 3
+ numRows -1
+ rawDataSize -1
+ totalSize 271
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
+InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: select * from outputTbl1 order by key, values
+PREHOOK: type: QUERY
+PREHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from outputTbl1 order by key, values
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@outputtbl1
+#### A masked pattern was here ####
+1 1
+1 1
+1 2
+2 1
+2 1
+2 2
+3 1
+3 1
+3 2
+7 1
+7 1
+7 2
+8 1
+8 1
+8 2
+8 2
+8 2