Posted to commits@hive.apache.org by se...@apache.org on 2017/02/28 20:03:11 UTC

hive git commit: HIVE-16040 : union column expansion should take aliases from the leftmost branch (Sergey Shelukhin, reviewed by Ashutosh Chauhan)

Repository: hive
Updated Branches:
  refs/heads/master 95da916eb -> 928b4c015


HIVE-16040 : union column expansion should take aliases from the leftmost branch (Sergey Shelukhin, reviewed by Ashutosh Chauhan)
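
For context, a minimal HiveQL illustration of the behavior this patch pins down (the statements below paraphrase the new union_pos_alias.q tests and are not part of the patch itself): when columns are expanded for a set operation, the output schema takes its aliases from the leftmost branch, so the view created below exposes a single column named k, not k1.

  -- Illustrative sketch, based on the test added in this commit:
  -- the view's schema comes from the leftmost branch, so its only
  -- column is named `k`; the `k1` alias in the second branch does
  -- not affect the output schema.
  create view v as
  select key as k from src
  intersect all
  select key as k1 from src;

  desc formatted v;  -- reports one column: k (string)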


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/928b4c01
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/928b4c01
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/928b4c01

Branch: refs/heads/master
Commit: 928b4c01531470601a68950625bbe64faea95674
Parents: 95da916
Author: Sergey Shelukhin <se...@apache.org>
Authored: Tue Feb 28 11:59:29 2017 -0800
Committer: Sergey Shelukhin <se...@apache.org>
Committed: Tue Feb 28 11:59:40 2017 -0800

----------------------------------------------------------------------
 .../apache/hadoop/hive/ql/parse/ParseUtils.java |  32 +-
 .../queries/clientpositive/union_pos_alias.q    |  20 +-
 .../results/clientpositive/perf/query14.q.out   |   4 +-
 .../clientpositive/union_pos_alias.q.out        | 307 +++++++++++++++++++
 4 files changed, 356 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/928b4c01/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java
index 473a664..54e37f7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java
@@ -51,6 +51,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
 
+import com.google.common.base.Preconditions;
+
 
 /**
  * Library of utility functions used in the parse code.
@@ -400,10 +402,9 @@ public final class ParseUtils {
           }
         }
       }
-      // We find the SELECT closest to the top. This assumes there's only one FROM or FROM-s
-      // are all equivalent (union case). Also, this assumption could be false for an already
-      // malformed query; we don't check for that here - it will fail later anyway.
-      // TODO: Maybe we should find ALL the SELECT-s not nested in another from, and compare.
+      // Note: we assume that this isn't an already malformed query;
+      //       we don't check for that here - it will fail later anyway.
+      // First, we find the SELECT closest to the top.
       ASTNode select = searcher.simpleBreadthFirstSearchAny((ASTNode)fromNode,
           HiveParser.TOK_SELECT, HiveParser.TOK_SELECTDI);
       if (select == null) {
@@ -412,6 +413,29 @@ public final class ParseUtils {
         setCols.token.setType(HiveParser.TOK_ALLCOLREF);
         return;
       }
+
+      // Then, find the leftmost logical sibling select, because that's what Hive uses for aliases. 
+      while (true) {
+        CommonTree queryOfSelect = select.parent;
+        while (queryOfSelect != null && queryOfSelect.getType() != HiveParser.TOK_QUERY) {
+          queryOfSelect = queryOfSelect.parent;
+        }
+        // We should find some QUERY node, and also its parent, because by assumption we are inside a subquery.
+        if (queryOfSelect == null || queryOfSelect.parent == null) {
+          LOG.debug("Replacing SETCOLREF with ALLCOLREF because we couldn't find the QUERY");
+          setCols.token.setType(HiveParser.TOK_ALLCOLREF);
+          return;
+        }
+        if (queryOfSelect.childIndex == 0) break; // We are the left-most child.
+        Tree moreToTheLeft = queryOfSelect.parent.getChild(0);
+        Preconditions.checkState(moreToTheLeft != queryOfSelect);
+        ASTNode newSelect = searcher.simpleBreadthFirstSearchAny((ASTNode)moreToTheLeft,
+          HiveParser.TOK_SELECT, HiveParser.TOK_SELECTDI);
+        Preconditions.checkState(newSelect != select);
+        select = newSelect;
+        // Repeat the procedure for the new select.
+      }
+
       // Found the proper columns.
       List<ASTNode> newChildren = new ArrayList<>(select.getChildCount());
       HashSet<String> aliases = new HashSet<>();
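
The loop added above climbs from the first SELECT it finds to the enclosing TOK_QUERY and, while that query is not the leftmost child of its parent set-operation node, restarts the search in the leftmost sibling; the aliases used for column expansion therefore always come from the first branch. A hypothetical HiveQL sketch of the effect (not taken from the patch; the subquery aliases are made up):

  -- Expanding the outer `select *` should use the aliases of the leftmost
  -- branch (x, y), not those of the second branch (a, b).
  select * from (
    select key as x, value as y from src
    union all
    select key as a, value as b from src
  ) u;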

http://git-wip-us.apache.org/repos/asf/hive/blob/928b4c01/ql/src/test/queries/clientpositive/union_pos_alias.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/union_pos_alias.q b/ql/src/test/queries/clientpositive/union_pos_alias.q
index c4eca68..60fea8f 100644
--- a/ql/src/test/queries/clientpositive/union_pos_alias.q
+++ b/ql/src/test/queries/clientpositive/union_pos_alias.q
@@ -27,4 +27,22 @@ UNION ALL
 select 'test', value from src_10 s3
 order by 2, 1 desc;
 
-drop table src_10;
\ No newline at end of file
+drop table src_10;
+
+
+drop view v;
+create view v as select key as k from src intersect all select key as k1 from src;
+desc formatted v;
+
+set hive.mapred.mode=nonstrict;
+set hive.security.authorization.manager=org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactoryForTest;
+
+create table masking_test as select cast(key as int) as key, value from src;
+
+explain
+select * from masking_test  union all select * from masking_test ;
+select * from masking_test  union all select * from masking_test ;
+
+explain
+select key as k1, value as v1 from masking_test where key > 0 intersect all select key as k2, value as v2 from masking_test where key > 0;
+select key as k1, value as v1 from masking_test where key > 0 intersect all select key as k2, value as v2 from masking_test where key > 0;

http://git-wip-us.apache.org/repos/asf/hive/blob/928b4c01/ql/src/test/results/clientpositive/perf/query14.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/perf/query14.q.out b/ql/src/test/results/clientpositive/perf/query14.q.out
index 55a2e5b..9821180 100644
--- a/ql/src/test/results/clientpositive/perf/query14.q.out
+++ b/ql/src/test/results/clientpositive/perf/query14.q.out
@@ -1,9 +1,9 @@
+Warning: Shuffle Join MERGEJOIN[916][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 114' is a cross product
+Warning: Shuffle Join MERGEJOIN[917][tables = [$hdt$_1, $hdt$_2, $hdt$_0]] in Stage 'Reducer 115' is a cross product
 Warning: Shuffle Join MERGEJOIN[914][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 61' is a cross product
 Warning: Shuffle Join MERGEJOIN[915][tables = [$hdt$_1, $hdt$_2, $hdt$_0]] in Stage 'Reducer 62' is a cross product
 Warning: Shuffle Join MERGEJOIN[912][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 5' is a cross product
 Warning: Shuffle Join MERGEJOIN[913][tables = [$hdt$_1, $hdt$_2, $hdt$_0]] in Stage 'Reducer 6' is a cross product
-Warning: Shuffle Join MERGEJOIN[916][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 114' is a cross product
-Warning: Shuffle Join MERGEJOIN[917][tables = [$hdt$_1, $hdt$_2, $hdt$_0]] in Stage 'Reducer 115' is a cross product
 PREHOOK: query: explain
 with  cross_items as
  (select i_item_sk ss_item_sk

http://git-wip-us.apache.org/repos/asf/hive/blob/928b4c01/ql/src/test/results/clientpositive/union_pos_alias.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/union_pos_alias.q.out b/ql/src/test/results/clientpositive/union_pos_alias.q.out
index 8eddbd9..8ffbc2a 100644
--- a/ql/src/test/results/clientpositive/union_pos_alias.q.out
+++ b/ql/src/test/results/clientpositive/union_pos_alias.q.out
@@ -306,3 +306,310 @@ POSTHOOK: query: drop table src_10
 POSTHOOK: type: DROPTABLE
 POSTHOOK: Input: default@src_10
 POSTHOOK: Output: default@src_10
+PREHOOK: query: drop view v
+PREHOOK: type: DROPVIEW
+POSTHOOK: query: drop view v
+POSTHOOK: type: DROPVIEW
+PREHOOK: query: create view v as select key as k from src intersect all select key as k1 from src
+PREHOOK: type: CREATEVIEW
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@v
+POSTHOOK: query: create view v as select key as k from src intersect all select key as k1 from src
+POSTHOOK: type: CREATEVIEW
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@v
+POSTHOOK: Lineage: v.k SCRIPT [(src)src.null, (src)src.FieldSchema(name:key, type:string, comment:default), ]
+PREHOOK: query: desc formatted v
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@v
+POSTHOOK: query: desc formatted v
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@v
+# col_name            	data_type           	comment             
+	 	 
+k                   	string              	                    
+	 	 
+# Detailed Table Information	 	 
+Database:           	default             	 
+#### A masked pattern was here ####
+Retention:          	0                   	 
+Table Type:         	VIRTUAL_VIEW        	 
+Table Parameters:	 	 
+#### A masked pattern was here ####
+	 	 
+# Storage Information	 	 
+SerDe Library:      	null                	 
+InputFormat:        	org.apache.hadoop.mapred.TextInputFormat	 
+OutputFormat:       	org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat	 
+Compressed:         	No                  	 
+Num Buckets:        	-1                  	 
+Bucket Columns:     	[]                  	 
+Sort Columns:       	[]                  	 
+	 	 
+# View Information	 	 
+View Original Text: 	select key as k from src intersect all select key as k1 from src	 
+View Expanded Text: 	select `src`.`key` as `k` from `default`.`src` intersect all select `src`.`key` as `k1` from `default`.`src`	 
+View Rewrite Enabled:	No                  	 
+PREHOOK: query: create table masking_test as select cast(key as int) as key, value from src
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@masking_test
+POSTHOOK: query: create table masking_test as select cast(key as int) as key, value from src
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@masking_test
+POSTHOOK: Lineage: masking_test.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: masking_test.value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: explain
+select * from masking_test  union all select * from masking_test
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select * from masking_test  union all select * from masking_test
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: masking_test
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (((key % 2) = 0) and (key < 10)) (type: boolean)
+              Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: int), reverse(value) (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE
+                Union
+                  Statistics: Num rows: 166 Data size: 1762 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 166 Data size: 1762 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          TableScan
+            alias: masking_test
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (((key % 2) = 0) and (key < 10)) (type: boolean)
+              Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: int), reverse(value) (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 83 Data size: 881 Basic stats: COMPLETE Column stats: NONE
+                Union
+                  Statistics: Num rows: 166 Data size: 1762 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 166 Data size: 1762 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from masking_test  union all select * from masking_test
+PREHOOK: type: QUERY
+PREHOOK: Input: default@masking_test
+#### A masked pattern was here ####
+POSTHOOK: query: select * from masking_test  union all select * from masking_test
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@masking_test
+#### A masked pattern was here ####
+0	0_lav
+0	0_lav
+4	4_lav
+4	4_lav
+8	8_lav
+8	8_lav
+0	0_lav
+0	0_lav
+0	0_lav
+0	0_lav
+2	2_lav
+2	2_lav
+PREHOOK: query: explain
+select key as k1, value as v1 from masking_test where key > 0 intersect all select key as k2, value as v2 from masking_test where key > 0
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select key as k1, value as v1 from masking_test where key > 0 intersect all select key as k2, value as v2 from masking_test where key > 0
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1, Stage-3
+  Stage-3 is a root stage
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: masking_test
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (((key % 2) = 0) and (key < 10) and (key > 0)) (type: boolean)
+              Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: int), reverse(value) (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(1)
+                  keys: _col0 (type: int), _col1 (type: string)
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2
+                  Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int), _col1 (type: string)
+                    sort order: ++
+                    Map-reduce partition columns: _col0 (type: int), _col1 (type: string)
+                    Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col2 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: int), KEY._col1 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 13 Data size: 137 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Union
+              Statistics: Num rows: 26 Data size: 274 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: min(_col2), count(_col2)
+                keys: _col0 (type: int), _col1 (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Statistics: Num rows: 26 Data size: 274 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int), _col1 (type: string)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: int), _col1 (type: string)
+                  Statistics: Num rows: 26 Data size: 274 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col2 (type: bigint), _col3 (type: bigint)
+          TableScan
+            Union
+              Statistics: Num rows: 26 Data size: 274 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: min(_col2), count(_col2)
+                keys: _col0 (type: int), _col1 (type: string)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Statistics: Num rows: 26 Data size: 274 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int), _col1 (type: string)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: int), _col1 (type: string)
+                  Statistics: Num rows: 26 Data size: 274 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col2 (type: bigint), _col3 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: min(VALUE._col0), count(VALUE._col1)
+          keys: KEY._col0 (type: int), KEY._col1 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 13 Data size: 137 Basic stats: COMPLETE Column stats: NONE
+          Filter Operator
+            predicate: (_col3 = 2) (type: boolean)
+            Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: _col2 (type: bigint), _col0 (type: int), _col1 (type: string)
+              outputColumnNames: _col0, _col1, _col2
+              Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+              UDTF Operator
+                Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+                function name: UDTFReplicateRows
+                Select Operator
+                  expressions: col1 (type: int), col2 (type: string)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-3
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: masking_test
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (((key % 2) = 0) and (key < 10) and (key > 0)) (type: boolean)
+              Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: key (type: int), reverse(value) (type: string)
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(1)
+                  keys: _col0 (type: int), _col1 (type: string)
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2
+                  Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int), _col1 (type: string)
+                    sort order: ++
+                    Map-reduce partition columns: _col0 (type: int), _col1 (type: string)
+                    Statistics: Num rows: 27 Data size: 286 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col2 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          keys: KEY._col0 (type: int), KEY._col1 (type: string)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Statistics: Num rows: 13 Data size: 137 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select key as k1, value as v1 from masking_test where key > 0 intersect all select key as k2, value as v2 from masking_test where key > 0
+PREHOOK: type: QUERY
+PREHOOK: Input: default@masking_test
+#### A masked pattern was here ####
+POSTHOOK: query: select key as k1, value as v1 from masking_test where key > 0 intersect all select key as k2, value as v2 from masking_test where key > 0
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@masking_test
+#### A masked pattern was here ####
+2	2_lav
+4	4_lav
+8	8_lav