You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by we...@apache.org on 2017/05/24 23:51:42 UTC
[08/54] [abbrv] hive git commit: HIVE-16698: HoS should avoid mapjoin
optimization in case of union and using table stats (Chao Sun,
reviewed by Xuefu Zhang)
HIVE-16698: HoS should avoid mapjoin optimization in case of union and using table stats (Chao Sun, reviewed by Xuefu Zhang)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/ed3c3edc
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/ed3c3edc
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/ed3c3edc
Branch: refs/heads/hive-14535
Commit: ed3c3edcf175276a1b061680651fa1753e354c0e
Parents: 9298315
Author: Chao Sun <su...@apache.org>
Authored: Wed May 17 09:39:29 2017 -0700
Committer: Chao Sun <su...@apache.org>
Committed: Wed May 17 23:33:52 2017 -0700
----------------------------------------------------------------------
.../test/resources/testconfiguration.properties | 2 +-
.../optimizer/spark/SparkMapJoinOptimizer.java | 3 +-
.../spark_use_file_size_for_mapjoin.q | 30 -
.../spark_use_ts_stats_for_mapjoin.q | 75 +++
.../spark/spark_use_ts_stats_for_mapjoin.q.out | 574 +++++++++++++++++++
5 files changed, 652 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/ed3c3edc/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 51385cf..e8df4d7 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -1377,7 +1377,7 @@ spark.only.query.files=spark_combine_equivalent_work.q,\
spark_dynamic_partition_pruning_2.q,\
spark_explainuser_1.q,\
spark_vectorized_dynamic_partition_pruning.q,\
- spark_use_file_size_for_mapjoin.q,\
+ spark_use_ts_stats_for_mapjoin.q,\
spark_use_op_stats.q
miniSparkOnYarn.query.files=auto_sortmerge_join_16.q,\
http://git-wip-us.apache.org/repos/asf/hive/blob/ed3c3edc/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
index 9243873..81c2348 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
@@ -206,7 +206,8 @@ public class SparkMapJoinOptimizer implements NodeProcessor {
LOG.debug("Checking map join optimization for operator {} using TS stats", joinOp);
for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
if (isBigTableBranch(parentOp)) {
- if (bigTablePosition < 0 && bigTableCandidateSet.contains(pos)) {
+ if (bigTablePosition < 0 && bigTableCandidateSet.contains(pos)
+ && !containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos);
bigTablePosition = pos;
bigTableFound = true;
http://git-wip-us.apache.org/repos/asf/hive/blob/ed3c3edc/ql/src/test/queries/clientpositive/spark_use_file_size_for_mapjoin.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/spark_use_file_size_for_mapjoin.q b/ql/src/test/queries/clientpositive/spark_use_file_size_for_mapjoin.q
deleted file mode 100644
index b623b83..0000000
--- a/ql/src/test/queries/clientpositive/spark_use_file_size_for_mapjoin.q
+++ /dev/null
@@ -1,30 +0,0 @@
-set hive.mapred.mode=nonstrict;
-set hive.auto.convert.join=true;
-set hive.spark.use.file.size.for.mapjoin=true;
-set hive.auto.convert.join.noconditionaltask.size=4000;
-
-EXPLAIN
-SELECT src1.key, src2.value
-FROM src src1 JOIN src src2 ON (src1.key = src2.key)
-WHERE src1.key = 97;
-
-SELECT src1.key, src2.value
-FROM src src1 JOIN src src2 ON (src1.key = src2.key)
-WHERE src1.key = 97;
-
-set hive.auto.convert.join.noconditionaltask.size=8000;
-
--- This is copied from auto_join2. Without the configuration both joins are mapjoins,
--- but with the configuration on, Hive should not turn the second join into mapjoin since it
--- has a upstream reduce sink.
-
-CREATE TABLE dest(key INT, value STRING) STORED AS TEXTFILE;
-
-EXPLAIN
-FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
-INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value;
-
-FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
-INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value;
-
-SELECT sum(hash(dest.key,dest.value)) FROM dest;
http://git-wip-us.apache.org/repos/asf/hive/blob/ed3c3edc/ql/src/test/queries/clientpositive/spark_use_ts_stats_for_mapjoin.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/spark_use_ts_stats_for_mapjoin.q b/ql/src/test/queries/clientpositive/spark_use_ts_stats_for_mapjoin.q
new file mode 100644
index 0000000..26d9e50
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/spark_use_ts_stats_for_mapjoin.q
@@ -0,0 +1,75 @@
+set hive.mapred.mode=nonstrict;
+set hive.auto.convert.join=true;
+set hive.spark.use.ts.stats.for.mapjoin=true;
+set hive.auto.convert.join.noconditionaltask.size=4000;
+
+EXPLAIN
+SELECT src1.key, src2.value
+FROM src src1 JOIN src src2 ON (src1.key = src2.key)
+WHERE src1.key = 97;
+
+SELECT src1.key, src2.value
+FROM src src1 JOIN src src2 ON (src1.key = src2.key)
+WHERE src1.key = 97;
+
+set hive.auto.convert.join.noconditionaltask.size=8000;
+
+-- This is copied from auto_join2. Without the configuration, both joins are mapjoins,
+-- but with the configuration on, Hive should not turn the second join into a mapjoin since it
+-- has an upstream reduce sink.
+
+CREATE TABLE dest(key INT, value STRING) STORED AS TEXTFILE;
+
+EXPLAIN
+FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
+INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value;
+
+FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
+INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value;
+
+SELECT sum(hash(dest.key,dest.value)) FROM dest;
+
+
+-- Test for HIVE-16698, for the case of UNION + MAPJOIN
+
+set hive.auto.convert.join.noconditionaltask.size=16;
+
+CREATE TABLE a (c1 STRING, c2 INT);
+CREATE TABLE b (c3 STRING, c4 INT);
+CREATE TABLE c (c1 STRING, c2 INT);
+CREATE TABLE d (c3 STRING, c4 INT);
+CREATE TABLE e (c5 STRING, c6 INT);
+INSERT INTO TABLE a VALUES ("a1", 1), ("a2", 2), ("a3", 3), ("a4", 4), ("a5", 5), ("a6", 6), ("a7", 7);
+INSERT INTO TABLE b VALUES ("b1", 1), ("b2", 2), ("b3", 3), ("b4", 4);
+INSERT INTO TABLE c VALUES ("c1", 1), ("c2", 2), ("c3", 3), ("c4", 4), ("c5", 5), ("c6", 6), ("c7", 7);
+INSERT INTO TABLE d VALUES ("d1", 1), ("d2", 2), ("d3", 3), ("d4", 4);
+INSERT INTO TABLE e VALUES ("d1", 1), ("d2", 2);
+
+EXPLAIN
+WITH t1 AS (
+SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3 FROM a JOIN b ON a.c2 = b.c4
+),
+t2 AS (
+SELECT c.c1 AS c1, c.c2 AS c2, d.c3 AS c3 FROM c JOIN d ON c.c2 = d.c4
+),
+t3 AS (
+SELECT * FROM t1 UNION ALL SELECT * FROM t2
+),
+t4 AS (
+SELECT t3.c1, t3.c3, t5.c5 FROM t3 JOIN e AS t5 ON t3.c2 = t5.c6
+)
+SELECT * FROM t4;
+
+WITH t1 AS (
+SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3 FROM a JOIN b ON a.c2 = b.c4
+),
+t2 AS (
+SELECT c.c1 AS c1, c.c2 AS c2, d.c3 AS c3 FROM c JOIN d ON c.c2 = d.c4
+),
+t3 AS (
+SELECT * FROM t1 UNION ALL SELECT * FROM t2
+),
+t4 AS (
+SELECT t3.c1, t3.c3, t5.c5 FROM t3 JOIN e AS t5 ON t3.c2 = t5.c6
+)
+SELECT * FROM t4;
http://git-wip-us.apache.org/repos/asf/hive/blob/ed3c3edc/ql/src/test/results/clientpositive/spark/spark_use_ts_stats_for_mapjoin.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/spark_use_ts_stats_for_mapjoin.q.out b/ql/src/test/results/clientpositive/spark/spark_use_ts_stats_for_mapjoin.q.out
new file mode 100644
index 0000000..7ebae9e
--- /dev/null
+++ b/ql/src/test/results/clientpositive/spark/spark_use_ts_stats_for_mapjoin.q.out
@@ -0,0 +1,574 @@
+PREHOOK: query: EXPLAIN
+SELECT src1.key, src2.value
+FROM src src1 JOIN src src2 ON (src1.key = src2.key)
+WHERE src1.key = 97
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT src1.key, src2.value
+FROM src src1 JOIN src src2 ON (src1.key = src2.key)
+WHERE src1.key = 97
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2), Map 3 (PARTITION-LEVEL SORT, 2)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: src1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (UDFToDouble(key) = 97.0) (type: boolean)
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: src2
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: (UDFToDouble(key) = 97.0) (type: boolean)
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: string)
+ Reducer 2
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col2
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col2 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT src1.key, src2.value
+FROM src src1 JOIN src src2 ON (src1.key = src2.key)
+WHERE src1.key = 97
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT src1.key, src2.value
+FROM src src1 JOIN src src2 ON (src1.key = src2.key)
+WHERE src1.key = 97
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+97 val_97
+97 val_97
+97 val_97
+97 val_97
+PREHOOK: query: CREATE TABLE dest(key INT, value STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@dest
+POSTHOOK: query: CREATE TABLE dest(key INT, value STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@dest
+PREHOOK: query: EXPLAIN
+FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
+INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
+INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-3 is a root stage
+ Stage-1 depends on stages: Stage-3
+ Stage-0 depends on stages: Stage-1
+ Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+ Stage: Stage-3
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: src3
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string), value (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double)
+ 1 UDFToDouble(_col0) (type: double)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2), Map 3 (PARTITION-LEVEL SORT, 2)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: src1
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: src2
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: key is not null (type: boolean)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string)
+ outputColumnNames: _col0
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reducer 2
+ Local Work:
+ Map Reduce Local Work
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: string)
+ 1 _col0 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 (UDFToDouble(_col0) + UDFToDouble(_col1)) (type: double)
+ 1 UDFToDouble(_col0) (type: double)
+ outputColumnNames: _col0, _col3
+ input vertices:
+ 1 Map 4
+ Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: UDFToInteger(_col0) (type: int), _col3 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 605 Data size: 6427 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ replace: true
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.dest
+
+ Stage: Stage-2
+ Stats-Aggr Operator
+
+PREHOOK: query: FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
+INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@dest
+POSTHOOK: query: FROM src src1 JOIN src src2 ON (src1.key = src2.key) JOIN src src3 ON (src1.key + src2.key = src3.key)
+INSERT OVERWRITE TABLE dest SELECT src1.key, src3.value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dest
+POSTHOOK: Lineage: dest.key EXPRESSION [(src)src1.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: dest.value SIMPLE [(src)src3.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: SELECT sum(hash(dest.key,dest.value)) FROM dest
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dest
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT sum(hash(dest.key,dest.value)) FROM dest
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dest
+#### A masked pattern was here ####
+33815990627
+PREHOOK: query: CREATE TABLE a (c1 STRING, c2 INT)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@a
+POSTHOOK: query: CREATE TABLE a (c1 STRING, c2 INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@a
+PREHOOK: query: CREATE TABLE b (c3 STRING, c4 INT)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@b
+POSTHOOK: query: CREATE TABLE b (c3 STRING, c4 INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@b
+PREHOOK: query: CREATE TABLE c (c1 STRING, c2 INT)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@c
+POSTHOOK: query: CREATE TABLE c (c1 STRING, c2 INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@c
+PREHOOK: query: CREATE TABLE d (c3 STRING, c4 INT)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@d
+POSTHOOK: query: CREATE TABLE d (c3 STRING, c4 INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@d
+PREHOOK: query: CREATE TABLE e (c5 STRING, c6 INT)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@e
+POSTHOOK: query: CREATE TABLE e (c5 STRING, c6 INT)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@e
+PREHOOK: query: INSERT INTO TABLE a VALUES ("a1", 1), ("a2", 2), ("a3", 3), ("a4", 4), ("a5", 5), ("a6", 6), ("a7", 7)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@a
+POSTHOOK: query: INSERT INTO TABLE a VALUES ("a1", 1), ("a2", 2), ("a3", 3), ("a4", 4), ("a5", 5), ("a6", 6), ("a7", 7)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@a
+POSTHOOK: Lineage: a.c1 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: a.c2 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: INSERT INTO TABLE b VALUES ("b1", 1), ("b2", 2), ("b3", 3), ("b4", 4)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@b
+POSTHOOK: query: INSERT INTO TABLE b VALUES ("b1", 1), ("b2", 2), ("b3", 3), ("b4", 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@b
+POSTHOOK: Lineage: b.c3 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: b.c4 EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: INSERT INTO TABLE c VALUES ("c1", 1), ("c2", 2), ("c3", 3), ("c4", 4), ("c5", 5), ("c6", 6), ("c7", 7)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@c
+POSTHOOK: query: INSERT INTO TABLE c VALUES ("c1", 1), ("c2", 2), ("c3", 3), ("c4", 4), ("c5", 5), ("c6", 6), ("c7", 7)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@c
+POSTHOOK: Lineage: c.c1 SIMPLE [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: c.c2 EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: INSERT INTO TABLE d VALUES ("d1", 1), ("d2", 2), ("d3", 3), ("d4", 4)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@d
+POSTHOOK: query: INSERT INTO TABLE d VALUES ("d1", 1), ("d2", 2), ("d3", 3), ("d4", 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@d
+POSTHOOK: Lineage: d.c3 SIMPLE [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: d.c4 EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: INSERT INTO TABLE e VALUES ("d1", 1), ("d2", 2)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@e
+POSTHOOK: query: INSERT INTO TABLE e VALUES ("d1", 1), ("d2", 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@e
+POSTHOOK: Lineage: e.c5 SIMPLE [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: e.c6 EXPRESSION [(values__tmp__table__5)values__tmp__table__5.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: EXPLAIN
+WITH t1 AS (
+SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3 FROM a JOIN b ON a.c2 = b.c4
+),
+t2 AS (
+SELECT c.c1 AS c1, c.c2 AS c2, d.c3 AS c3 FROM c JOIN d ON c.c2 = d.c4
+),
+t3 AS (
+SELECT * FROM t1 UNION ALL SELECT * FROM t2
+),
+t4 AS (
+SELECT t3.c1, t3.c3, t5.c5 FROM t3 JOIN e AS t5 ON t3.c2 = t5.c6
+)
+SELECT * FROM t4
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+WITH t1 AS (
+SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3 FROM a JOIN b ON a.c2 = b.c4
+),
+t2 AS (
+SELECT c.c1 AS c1, c.c2 AS c2, d.c3 AS c3 FROM c JOIN d ON c.c2 = d.c4
+),
+t3 AS (
+SELECT * FROM t1 UNION ALL SELECT * FROM t2
+),
+t4 AS (
+SELECT t3.c1, t3.c3, t5.c5 FROM t3 JOIN e AS t5 ON t3.c2 = t5.c6
+)
+SELECT * FROM t4
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-2 is a root stage
+ Stage-3 depends on stages: Stage-2
+ Stage-1 depends on stages: Stage-3
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-2
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: b
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: c4 is not null (type: boolean)
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: c3 (type: string), c4 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 _col1 (type: int)
+ 1 _col1 (type: int)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-3
+ Spark
+#### A masked pattern was here ####
+ Vertices:
+ Map 5
+ Map Operator Tree:
+ TableScan
+ alias: d
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: c4 is not null (type: boolean)
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: c3 (type: string), c4 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+ Spark HashTable Sink Operator
+ keys:
+ 0 _col1 (type: int)
+ 1 _col1 (type: int)
+ Local Work:
+ Map Reduce Local Work
+
+ Stage: Stage-1
+ Spark
+ Edges:
+ Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 2), Map 4 (PARTITION-LEVEL SORT, 2), Map 6 (PARTITION-LEVEL SORT, 2)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: a
+ Statistics: Num rows: 7 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: c2 is not null (type: boolean)
+ Statistics: Num rows: 7 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: c1 (type: string), c2 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 7 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ input vertices:
+ 1 Map 3
+ Statistics: Num rows: 7 Data size: 30 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 14 Data size: 60 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: string), _col2 (type: string)
+ Local Work:
+ Map Reduce Local Work
+ Map 4
+ Map Operator Tree:
+ TableScan
+ alias: c
+ Statistics: Num rows: 7 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: c2 is not null (type: boolean)
+ Statistics: Num rows: 7 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: c1 (type: string), c2 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 7 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ input vertices:
+ 1 Map 5
+ Statistics: Num rows: 7 Data size: 30 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 14 Data size: 60 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: string), _col2 (type: string)
+ Local Work:
+ Map Reduce Local Work
+ Map 6
+ Map Operator Tree:
+ TableScan
+ alias: t5
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ predicate: c6 is not null (type: boolean)
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: c5 (type: string), c6 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col1 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col1 (type: int)
+ Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: string)
+ Reducer 2
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col1 (type: int)
+ 1 _col1 (type: int)
+ outputColumnNames: _col0, _col2, _col3
+ Statistics: Num rows: 15 Data size: 66 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: string), _col2 (type: string), _col3 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 15 Data size: 66 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 15 Data size: 66 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: WITH t1 AS (
+SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3 FROM a JOIN b ON a.c2 = b.c4
+),
+t2 AS (
+SELECT c.c1 AS c1, c.c2 AS c2, d.c3 AS c3 FROM c JOIN d ON c.c2 = d.c4
+),
+t3 AS (
+SELECT * FROM t1 UNION ALL SELECT * FROM t2
+),
+t4 AS (
+SELECT t3.c1, t3.c3, t5.c5 FROM t3 JOIN e AS t5 ON t3.c2 = t5.c6
+)
+SELECT * FROM t4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@a
+PREHOOK: Input: default@b
+PREHOOK: Input: default@c
+PREHOOK: Input: default@d
+PREHOOK: Input: default@e
+#### A masked pattern was here ####
+POSTHOOK: query: WITH t1 AS (
+SELECT a.c1 AS c1, a.c2 AS c2, b.c3 AS c3 FROM a JOIN b ON a.c2 = b.c4
+),
+t2 AS (
+SELECT c.c1 AS c1, c.c2 AS c2, d.c3 AS c3 FROM c JOIN d ON c.c2 = d.c4
+),
+t3 AS (
+SELECT * FROM t1 UNION ALL SELECT * FROM t2
+),
+t4 AS (
+SELECT t3.c1, t3.c3, t5.c5 FROM t3 JOIN e AS t5 ON t3.c2 = t5.c6
+)
+SELECT * FROM t4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@a
+POSTHOOK: Input: default@b
+POSTHOOK: Input: default@c
+POSTHOOK: Input: default@d
+POSTHOOK: Input: default@e
+#### A masked pattern was here ####
+c2 d2 d2
+a2 b2 d2
+a1 b1 d1
+c1 d1 d1