You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by vg...@apache.org on 2019/07/08 19:51:27 UTC
[hive] branch master updated: HIVE-21915: Hive with TEZ UNION ALL and UDTF results in data loss (Wei Zhang, reviewed by Vineet Garg)
This is an automated email from the ASF dual-hosted git repository.
vgarg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new d7214ea HIVE-21915: Hive with TEZ UNION ALL and UDTF results in data loss (Wei Zhang, reviewed by Vineet Garg)
d7214ea is described below
commit d7214eae068b8c76fd84bb4bcfbc6de7c22da335
Author: Vineet Garg <vg...@apache.org>
AuthorDate: Mon Jul 8 12:50:35 2019 -0700
HIVE-21915: Hive with TEZ UNION ALL and UDTF results in data loss (Wei Zhang, reviewed by Vineet Garg)
---
.../test/resources/testconfiguration.properties | 3 +-
.../apache/hadoop/hive/ql/parse/GenTezUtils.java | 6 +-
.../test/queries/clientpositive/tez_union_udtf.q | 22 +++
.../clientpositive/tez/tez_union_udtf.q.out | 153 +++++++++++++++++++++
4 files changed, 182 insertions(+), 2 deletions(-)
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 7e3294f..6d6e5db 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -44,7 +44,8 @@ minitez.query.files=acid_vectorization_original_tez.q,\
hybridgrace_hashjoin_2.q,\
multi_count_distinct.q,\
tez-tag.q,\
- tez_union_with_udf.q
+ tez_union_with_udf.q,\
+ tez_union_udtf.q
minillap.shared.query.files=insert_into1.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
index d64f983..2877479 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java
@@ -318,7 +318,11 @@ public class GenTezUtils {
FileSinkOperator fileSink = (FileSinkOperator)current;
// remember it for additional processing later
- context.fileSinkSet.add(fileSink);
+ if (context.fileSinkSet.contains(fileSink)) {
+ continue;
+ } else {
+ context.fileSinkSet.add(fileSink);
+ }
FileSinkDesc desc = fileSink.getConf();
Path path = desc.getDirName();
diff --git a/ql/src/test/queries/clientpositive/tez_union_udtf.q b/ql/src/test/queries/clientpositive/tez_union_udtf.q
new file mode 100644
index 0000000..ed58cfd
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/tez_union_udtf.q
@@ -0,0 +1,22 @@
+--! qt:dataset:src1
+--! qt:dataset:src
+set hive.merge.tezfiles=true;
+-- SORT_BEFORE_DIFF
+
+EXPLAIN
+CREATE TABLE x AS
+ SELECT key, 1 as tag FROM src WHERE key = '238'
+ UNION ALL
+ SELECT key, tag FROM src1
+ LATERAL VIEW EXPLODE(array(2)) tf as tag
+ WHERE key = '238';
+
+CREATE TABLE x AS
+ SELECT key, 1 as tag FROM src WHERE key = '238'
+ UNION ALL
+ SELECT key, tag FROM src1
+ LATERAL VIEW EXPLODE(array(2)) tf as tag
+ WHERE key = '238';
+
+SELECT * FROM x;
+
diff --git a/ql/src/test/results/clientpositive/tez/tez_union_udtf.q.out b/ql/src/test/results/clientpositive/tez/tez_union_udtf.q.out
new file mode 100644
index 0000000..cfe4481
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/tez_union_udtf.q.out
@@ -0,0 +1,153 @@
+PREHOOK: query: EXPLAIN
+CREATE TABLE x AS
+ SELECT key, 1 as tag FROM src WHERE key = '238'
+ UNION ALL
+ SELECT key, tag FROM src1
+ LATERAL VIEW EXPLODE(array(2)) tf as tag
+ WHERE key = '238'
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Input: default@src1
+PREHOOK: Output: database:default
+PREHOOK: Output: default@x
+POSTHOOK: query: EXPLAIN
+CREATE TABLE x AS
+ SELECT key, 1 as tag FROM src WHERE key = '238'
+ UNION ALL
+ SELECT key, tag FROM src1
+ LATERAL VIEW EXPLODE(array(2)) tf as tag
+ WHERE key = '238'
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@src1
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@x
+Plan not optimized by CBO because the statement has lateral views
+
+Vertex dependency in root stage
+Map 1 <- Union 2 (CONTAINS)
+Map 4 <- Union 2 (CONTAINS)
+Reducer 3 <- Union 2 (CUSTOM_SIMPLE_EDGE)
+
+Stage-3
+ Stats Work{}
+ Stage-9
+ Create Table{"name:":"default.x"}
+ Stage-0
+ Move Operator
+ Stage-5(CONDITIONAL)
+ Move Operator
+ Stage-8(CONDITIONAL CHILD TASKS: Stage-5, Stage-4, Stage-6)
+ Conditional Operator
+ Stage-1
+ Reducer 3
+ File Output Operator [FS_23]
+ Group By Operator [GBY_21] (rows=1 width=880)
+ Output:["_col0","_col1"],aggregations:["compute_stats(VALUE._col0)","compute_stats(VALUE._col1)"]
+ <-Union 2 [CUSTOM_SIMPLE_EDGE]
+ <-Map 1 [CONTAINS]
+ File Output Operator [FS_31]
+ table:{"name:":"default.x"}
+ Select Operator [SEL_30] (rows=6 width=91)
+ Output:["_col0","_col1"]
+ Select Operator [SEL_28] (rows=2 width=91)
+ Output:["_col1"]
+ Filter Operator [FIL_27] (rows=2 width=87)
+ predicate:(key = '238')
+ TableScan [TS_26] (rows=500 width=87)
+ Output:["key"]
+ Reduce Output Operator [RS_34]
+ Group By Operator [GBY_33] (rows=1 width=864)
+ Output:["_col0","_col1"],aggregations:["compute_stats(col1, 'hll')","compute_stats(col2, 'hll')"]
+ Select Operator [SEL_32] (rows=6 width=91)
+ Output:["col1","col2"]
+ Please refer to the previous Select Operator [SEL_30]
+ <-Map 4 [CONTAINS]
+ File Output Operator [FS_45]
+ table:{"name:":"default.x"}
+ Select Operator [SEL_44] (rows=6 width=91)
+ Output:["_col0","_col1"]
+ Select Operator [SEL_42] (rows=4 width=87)
+ Output:["_col1"]
+ Lateral View Join Operator [LVJ_40] (rows=4 width=239)
+ Output:["_col5"]
+ Select Operator [SEL_38] (rows=2 width=431)
+ Lateral View Forward [LVF_37] (rows=2 width=86)
+ Filter Operator [FIL_36] (rows=2 width=86)
+ predicate:(key = '238')
+ TableScan [TS_35] (rows=25 width=86)
+ Output:["key"]
+ Reduce Output Operator [RS_48]
+ Group By Operator [GBY_47] (rows=1 width=864)
+ Output:["_col0","_col1"],aggregations:["compute_stats(col1, 'hll')","compute_stats(col2, 'hll')"]
+ Select Operator [SEL_46] (rows=6 width=91)
+ Output:["col1","col2"]
+ Please refer to the previous Select Operator [SEL_44]
+ File Output Operator [FS_45]
+ table:{"name:":"default.x"}
+ Select Operator [SEL_44] (rows=6 width=91)
+ Output:["_col0","_col1"]
+ Select Operator [SEL_42] (rows=4 width=87)
+ Output:["_col1"]
+ Lateral View Join Operator [LVJ_40] (rows=4 width=239)
+ Output:["_col5"]
+ UDTF Operator [UDTF_41] (rows=2 width=48)
+ function name:explode
+ Select Operator [SEL_39] (rows=2 width=48)
+ Output:["_col0"]
+ Please refer to the previous Lateral View Forward [LVF_37]
+ Reduce Output Operator [RS_48]
+ Group By Operator [GBY_47] (rows=1 width=864)
+ Output:["_col0","_col1"],aggregations:["compute_stats(col1, 'hll')","compute_stats(col2, 'hll')"]
+ Select Operator [SEL_46] (rows=6 width=91)
+ Output:["col1","col2"]
+ Please refer to the previous Select Operator [SEL_44]
+ Stage-4(CONDITIONAL)
+ File Merge
+ Please refer to the previous Stage-8(CONDITIONAL CHILD TASKS: Stage-5, Stage-4, Stage-6)
+ Stage-7
+ Move Operator
+ Stage-6(CONDITIONAL)
+ File Merge
+ Please refer to the previous Stage-8(CONDITIONAL CHILD TASKS: Stage-5, Stage-4, Stage-6)
+ Stage-2
+ Dependency Collection{}
+ Please refer to the previous Stage-5(CONDITIONAL)
+ Please refer to the previous Stage-4(CONDITIONAL)
+ Please refer to the previous Stage-7
+
+PREHOOK: query: CREATE TABLE x AS
+ SELECT key, 1 as tag FROM src WHERE key = '238'
+ UNION ALL
+ SELECT key, tag FROM src1
+ LATERAL VIEW EXPLODE(array(2)) tf as tag
+ WHERE key = '238'
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Input: default@src1
+PREHOOK: Output: database:default
+PREHOOK: Output: default@x
+POSTHOOK: query: CREATE TABLE x AS
+ SELECT key, 1 as tag FROM src WHERE key = '238'
+ UNION ALL
+ SELECT key, tag FROM src1
+ LATERAL VIEW EXPLODE(array(2)) tf as tag
+ WHERE key = '238'
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Input: default@src1
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@x
+POSTHOOK: Lineage: x.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src1)src1.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: x.tag EXPRESSION []
+PREHOOK: query: SELECT * FROM x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@x
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT * FROM x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@x
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+238 1
+238 1
+238 2