You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ct...@apache.org on 2015/12/05 17:52:27 UTC

hive git commit: HIVE-12566: Incorrect result returns when using COALESCE in WHERE condition with LEFT JOIN (Chaoyu Tang, reviewed by Xuefu Zhang, Jesus Camacho Rodriguez)

Repository: hive
Updated Branches:
  refs/heads/master b2efd1a69 -> 1806dbce4


HIVE-12566: Incorrect result returns when using COALESCE in WHERE condition with LEFT JOIN (Chaoyu Tang, reviewed by Xuefu Zhang, Jesus Camacho Rodriguez)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1806dbce
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1806dbce
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1806dbce

Branch: refs/heads/master
Commit: 1806dbce41bddc3cbb6741ca87c05f5fe83cb3f4
Parents: b2efd1a
Author: ctang <ct...@gmail.com>
Authored: Sat Dec 5 11:51:45 2015 -0500
Committer: ctang <ct...@gmail.com>
Committed: Sat Dec 5 11:51:45 2015 -0500

----------------------------------------------------------------------
 .../hadoop/hive/ql/parse/SemanticAnalyzer.java  |   2 +-
 .../clientpositive/join_cond_pushdown_unqual5.q |  14 ++
 .../join_cond_pushdown_unqual5.q.out            | 186 +++++++++++++++++++
 3 files changed, 201 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/1806dbce/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 4ebdf90..e1a0c4a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -9643,7 +9643,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
          * as Join conditions
          */
         Set<String> dests = qb.getParseInfo().getClauseNames();
-        if ( dests.size() == 1 ) {
+        if ( dests.size() == 1 && joinTree.getNoOuterJoin()) {
           String dest = dests.iterator().next();
           ASTNode whereClause = qb.getParseInfo().getWhrForClause(dest);
           if ( whereClause != null ) {

http://git-wip-us.apache.org/repos/asf/hive/blob/1806dbce/ql/src/test/queries/clientpositive/join_cond_pushdown_unqual5.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/join_cond_pushdown_unqual5.q b/ql/src/test/queries/clientpositive/join_cond_pushdown_unqual5.q
new file mode 100644
index 0000000..7e6d32b
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/join_cond_pushdown_unqual5.q
@@ -0,0 +1,14 @@
+-- outer join is not qualified for pushing down of where to join condition
+CREATE TABLE ltable (index int, la int, lk1 string, lk2 string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
+CREATE TABLE rtable (ra int, rk1 string, rk2 string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
+
+insert into ltable values (1, null, 'CD5415192314304', '00071'), (2, null, 'CD5415192225530', '00071');
+insert into rtable values (1, 'CD5415192314304', '00071'), (45, 'CD5415192314304', '00072');
+
+set hive.auto.convert.join=false;
+EXPLAIN SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY');
+SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY');
+
+set hive.auto.convert.join=true;
+EXPLAIN SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY');
+SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY');
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/1806dbce/ql/src/test/results/clientpositive/join_cond_pushdown_unqual5.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/join_cond_pushdown_unqual5.q.out b/ql/src/test/results/clientpositive/join_cond_pushdown_unqual5.q.out
new file mode 100644
index 0000000..ab333b0
--- /dev/null
+++ b/ql/src/test/results/clientpositive/join_cond_pushdown_unqual5.q.out
@@ -0,0 +1,186 @@
+PREHOOK: query: -- outer join is not qualified for pushing down of where to join condition
+CREATE TABLE ltable (index int, la int, lk1 string, lk2 string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ltable
+POSTHOOK: query: -- outer join is not qualified for pushing down of where to join condition
+CREATE TABLE ltable (index int, la int, lk1 string, lk2 string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ltable
+PREHOOK: query: CREATE TABLE rtable (ra int, rk1 string, rk2 string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@rtable
+POSTHOOK: query: CREATE TABLE rtable (ra int, rk1 string, rk2 string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@rtable
+PREHOOK: query: insert into ltable values (1, null, 'CD5415192314304', '00071'), (2, null, 'CD5415192225530', '00071')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@ltable
+POSTHOOK: query: insert into ltable values (1, null, 'CD5415192314304', '00071'), (2, null, 'CD5415192225530', '00071')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@ltable
+POSTHOOK: Lineage: ltable.index EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: ltable.la EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+POSTHOOK: Lineage: ltable.lk1 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ]
+POSTHOOK: Lineage: ltable.lk2 SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col4, type:string, comment:), ]
+PREHOOK: query: insert into rtable values (1, 'CD5415192314304', '00071'), (45, 'CD5415192314304', '00072')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__2
+PREHOOK: Output: default@rtable
+POSTHOOK: query: insert into rtable values (1, 'CD5415192314304', '00071'), (45, 'CD5415192314304', '00072')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__2
+POSTHOOK: Output: default@rtable
+POSTHOOK: Lineage: rtable.ra EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: rtable.rk1 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+POSTHOOK: Lineage: rtable.rk2 SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col3, type:string, comment:), ]
+PREHOOK: query: EXPLAIN SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: l
+            Statistics: Num rows: 2 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+            Reduce Output Operator
+              key expressions: lk1 (type: string), lk2 (type: string)
+              sort order: ++
+              Map-reduce partition columns: lk1 (type: string), lk2 (type: string)
+              Statistics: Num rows: 2 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+              value expressions: index (type: int), la (type: int)
+          TableScan
+            alias: r
+            Statistics: Num rows: 2 Data size: 47 Basic stats: COMPLETE Column stats: NONE
+            Reduce Output Operator
+              key expressions: rk1 (type: string), rk2 (type: string)
+              sort order: ++
+              Map-reduce partition columns: rk1 (type: string), rk2 (type: string)
+              Statistics: Num rows: 2 Data size: 47 Basic stats: COMPLETE Column stats: NONE
+              value expressions: ra (type: int)
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Left Outer Join0 to 1
+          keys:
+            0 lk1 (type: string), lk2 (type: string)
+            1 rk1 (type: string), rk2 (type: string)
+          outputColumnNames: _col0, _col1, _col2, _col3, _col7, _col8, _col9
+          Statistics: Num rows: 2 Data size: 57 Basic stats: COMPLETE Column stats: NONE
+          Filter Operator
+            predicate: (COALESCE(_col1,'EMPTY') = COALESCE(_col7,'EMPTY')) (type: boolean)
+            Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: _col0 (type: int), _col1 (type: int), _col2 (type: string), _col3 (type: string), _col7 (type: int), _col8 (type: string), _col9 (type: string)
+              outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
+              Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ltable
+PREHOOK: Input: default@rtable
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ltable
+POSTHOOK: Input: default@rtable
+#### A masked pattern was here ####
+2	NULL	CD5415192225530	00071	NULL	NULL	NULL
+PREHOOK: query: EXPLAIN SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-4 is a root stage
+  Stage-3 depends on stages: Stage-4
+  Stage-0 depends on stages: Stage-3
+
+STAGE PLANS:
+  Stage: Stage-4
+    Map Reduce Local Work
+      Alias -> Map Local Tables:
+        r 
+          Fetch Operator
+            limit: -1
+      Alias -> Map Local Operator Tree:
+        r 
+          TableScan
+            alias: r
+            Statistics: Num rows: 2 Data size: 47 Basic stats: COMPLETE Column stats: NONE
+            HashTable Sink Operator
+              keys:
+                0 lk1 (type: string), lk2 (type: string)
+                1 rk1 (type: string), rk2 (type: string)
+
+  Stage: Stage-3
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: l
+            Statistics: Num rows: 2 Data size: 52 Basic stats: COMPLETE Column stats: NONE
+            Map Join Operator
+              condition map:
+                   Left Outer Join0 to 1
+              keys:
+                0 lk1 (type: string), lk2 (type: string)
+                1 rk1 (type: string), rk2 (type: string)
+              outputColumnNames: _col0, _col1, _col2, _col3, _col7, _col8, _col9
+              Statistics: Num rows: 2 Data size: 57 Basic stats: COMPLETE Column stats: NONE
+              Filter Operator
+                predicate: (COALESCE(_col1,'EMPTY') = COALESCE(_col7,'EMPTY')) (type: boolean)
+                Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col0 (type: int), _col1 (type: int), _col2 (type: string), _col3 (type: string), _col7 (type: int), _col8 (type: string), _col9 (type: string)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
+                  Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+      Local Work:
+        Map Reduce Local Work
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ltable
+PREHOOK: Input: default@rtable
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ltable l LEFT OUTER JOIN rtable r on (l.lk1 = r.rk1 AND l.lk2 = r.rk2) WHERE COALESCE(l.la,'EMPTY')=COALESCE(r.ra,'EMPTY')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ltable
+POSTHOOK: Input: default@rtable
+#### A masked pattern was here ####
+2	NULL	CD5415192225530	00071	NULL	NULL	NULL