You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2013/04/02 05:56:11 UTC
svn commit: r1463373 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
test/queries/clientpositive/groupby_sort_9.q
test/results/clientpositive/groupby_sort_9.q.out
Author: namit
Date: Tue Apr 2 03:56:10 2013
New Revision: 1463373
URL: http://svn.apache.org/r1463373
Log:
HIVE-4270 bug in hive.map.groupby.sorted in the presence of multiple input partitions
(Namit via Gang Tim Liu)
Added:
hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_9.q
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_9.q.out
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java?rev=1463373&r1=1463372&r2=1463373&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java Tue Apr 2 03:56:10 2013
@@ -335,10 +335,13 @@ public class GroupByOptimizer implements
throw new SemanticException(e.getMessage(), e);
}
+ List<Partition> notDeniedPartns = partsList.getNotDeniedPartns();
+
GroupByOptimizerSortMatch currentMatch =
- partsList.getNotDeniedPartns().isEmpty() ? GroupByOptimizerSortMatch.NO_MATCH :
+ notDeniedPartns.isEmpty() ? GroupByOptimizerSortMatch.NO_MATCH :
+ notDeniedPartns.size() > 1 ? GroupByOptimizerSortMatch.PARTIAL_MATCH :
GroupByOptimizerSortMatch.COMPLETE_MATCH;
- for (Partition part : partsList.getNotDeniedPartns()) {
+ for (Partition part : notDeniedPartns) {
List<String> sortCols = part.getSortColNames();
List<String> bucketCols = part.getBucketCols();
GroupByOptimizerSortMatch match = matchBucketSortCols(groupByCols, bucketCols, sortCols);
Added: hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_9.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_9.q?rev=1463373&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_9.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_9.q Tue Apr 2 03:56:10 2013
@@ -0,0 +1,21 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 10;
+set hive.map.groupby.sorted=true;
+
+CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 PARTITION (ds='1');
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 PARTITION (ds='1') select key, val from T1 where ds = '1';
+INSERT OVERWRITE TABLE T1 PARTITION (ds='2') select key, val from T1 where ds = '1';
+
+-- The plan is not converted to a map-side, since although the sorting columns and grouping
+-- columns match, the user is querying multiple input partitions
+EXPLAIN
+select key, count(1) from T1 group by key;
+select key, count(1) from T1 group by key;
+
+DROP TABLE T1;
\ No newline at end of file
Added: hive/trunk/ql/src/test/results/clientpositive/groupby_sort_9.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/groupby_sort_9.q.out?rev=1463373&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/groupby_sort_9.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/groupby_sort_9.q.out Tue Apr 2 03:56:10 2013
@@ -0,0 +1,158 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 PARTITION (ds='1')
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 PARTITION (ds='1')
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+POSTHOOK: Output: default@t1@ds=1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 PARTITION (ds='1') select key, val from T1 where ds = '1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+PREHOOK: Output: default@t1@ds=1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 PARTITION (ds='1') select key, val from T1 where ds = '1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+POSTHOOK: Output: default@t1@ds=1
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: INSERT OVERWRITE TABLE T1 PARTITION (ds='2') select key, val from T1 where ds = '1'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+PREHOOK: Output: default@t1@ds=2
+POSTHOOK: query: INSERT OVERWRITE TABLE T1 PARTITION (ds='2') select key, val from T1 where ds = '1'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+POSTHOOK: Output: default@t1@ds=2
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The plan is not converted to a map-side, since although the sorting columns and grouping
+-- columns match, the user is querying multiple input partitions
+EXPLAIN
+select key, count(1) from T1 group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan is not converted to a map-side, since although the sorting columns and grouping
+-- columns match, the user is querying multiple input partitions
+EXPLAIN
+select key, count(1) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select key, count(1) from T1 group by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+PREHOOK: Input: default@t1@ds=2
+#### A masked pattern was here ####
+POSTHOOK: query: select key, count(1) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+POSTHOOK: Input: default@t1@ds=2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+1 2
+2 2
+3 2
+7 2
+8 4
+PREHOOK: query: DROP TABLE T1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: DROP TABLE T1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=2).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]