You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by na...@apache.org on 2013/04/20 18:23:26 UTC
svn commit: r1470182 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/exec/
java/org/apache/hadoop/hive/ql/optimizer/
java/org/apache/hadoop/hive/ql/plan/ test/queries/clientpositive/
test/results/clientpositive/ test/results/compiler/plan/
Author: namit
Date: Sat Apr 20 16:23:25 2013
New Revision: 1470182
URL: http://svn.apache.org/r1470182
Log:
HIVE-4310 optimize count(distinct) with hive.map.groupby.sorted
(Namit Jain via Gang Tim Liu)
Added:
hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_11.q
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_11.q.out
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java
hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_8.q
hive/trunk/ql/src/test/results/clientpositive/groupby_sort_8.q.out
hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml
hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java?rev=1470182&r1=1470181&r2=1470182&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java Sat Apr 20 16:23:25 2013
@@ -58,12 +58,12 @@ import org.apache.hadoop.hive.serde2.laz
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObject;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
@@ -90,8 +90,6 @@ public class GroupByOperator extends Ope
protected transient ObjectInspector[][] aggregationParameterObjectInspectors;
protected transient ObjectInspector[][] aggregationParameterStandardObjectInspectors;
protected transient Object[][] aggregationParameterObjects;
- // In the future, we may allow both count(DISTINCT a) and sum(DISTINCT a) in
- // the same SQL clause,
// so aggregationIsDistinct is a boolean array instead of a single number.
protected transient boolean[] aggregationIsDistinct;
// Map from integer tag to distinct aggrs
@@ -887,8 +885,15 @@ public class GroupByOperator extends Ope
// Forward the current keys if needed for sort-based aggregation
if (currentKeys != null && !keysAreEqual) {
- forward(currentKeys.getKeyArray(), aggregations);
- countAfterReport = 0;
+ // This is to optimize queries of the form:
+ // select count(distinct key) from T
+ // where T is sorted and bucketized by key
+ // Partial aggregation is performed on the mapper, and the
+ // reducer gets 1 row (partial result) per mapper.
+ if (!conf.isDontResetAggrsDistinct()) {
+ forward(currentKeys.getKeyArray(), aggregations);
+ countAfterReport = 0;
+ }
}
// Need to update the keys?
@@ -900,7 +905,10 @@ public class GroupByOperator extends Ope
}
// Reset the aggregations
- resetAggregations(aggregations);
+ // For distincts optimization with sorting/bucketing, perform partial aggregation
+ if (!conf.isDontResetAggrsDistinct()) {
+ resetAggregations(aggregations);
+ }
// clear parameters in last-invoke
for (int i = 0; i < aggregationsParametersLastInvoke.length; i++) {
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java?rev=1470182&r1=1470181&r2=1470182&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java Sat Apr 20 16:23:25 2013
@@ -53,6 +53,7 @@ import org.apache.hadoop.hive.ql.optimiz
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
@@ -60,6 +61,7 @@ import org.apache.hadoop.hive.ql.plan.Ex
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.util.StringUtils;
/**
@@ -107,7 +109,7 @@ public class GroupByOptimizer implements
GraphWalker ogw = new DefaultGraphWalker(disp);
// Create a list of topop nodes
- ArrayList<Node> topNodes = new ArrayList<Node>();
+ List<Node> topNodes = new ArrayList<Node>();
topNodes.addAll(pctx.getTopOps().values());
ogw.startWalking(topNodes, null);
@@ -174,15 +176,83 @@ public class GroupByOptimizer implements
GroupByOptimizerSortMatch match = checkSortGroupBy(stack, groupByOp);
boolean useMapperSort =
HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT);
+ GroupByDesc groupByOpDesc = groupByOp.getConf();
+
+ boolean removeReduceSink = false;
+ boolean optimizeDistincts = false;
+ boolean setBucketGroup = false;
// Dont remove the operator for distincts
- if (useMapperSort && !groupByOp.getConf().isDistinct() &&
+ if (useMapperSort &&
(match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) {
- convertGroupByMapSideSortedGroupBy(hiveConf, groupByOp, depth);
+ if (!groupByOpDesc.isDistinct()) {
+ removeReduceSink = true;
+ }
+ else if (!HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
+ // Optimize the query: select count(distinct keys) from T, where
+ // T is bucketized and sorted by key
+ // Partial aggregation can be done by the mappers in this scenario
+
+ List<ExprNodeDesc> keys =
+ ((GroupByOperator)
+ (groupByOp.getChildOperators().get(0).getChildOperators().get(0)))
+ .getConf().getKeys();
+ if ((keys == null) || (keys.isEmpty())) {
+ optimizeDistincts = true;
+ }
+ }
}
- else if ((match == GroupByOptimizerSortMatch.PARTIAL_MATCH) ||
+
+ if ((match == GroupByOptimizerSortMatch.PARTIAL_MATCH) ||
(match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) {
- groupByOp.getConf().setBucketGroup(true);
+ setBucketGroup = true;
+ }
+
+ if (removeReduceSink) {
+ convertGroupByMapSideSortedGroupBy(hiveConf, groupByOp, depth);
+ }
+ else if (optimizeDistincts) {
+ // In test mode, don't change the query plan. However, set up a query property
+ pGraphContext.getQueryProperties().setHasMapGroupBy(true);
+ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT_TESTMODE)) {
+ return;
+ }
+ ReduceSinkOperator reduceSinkOp =
+ (ReduceSinkOperator)groupByOp.getChildOperators().get(0);
+ GroupByDesc childGroupByDesc =
+ ((GroupByOperator)
+ (reduceSinkOp.getChildOperators().get(0))).getConf();
+
+ for (int pos = 0; pos < childGroupByDesc.getAggregators().size(); pos++) {
+ AggregationDesc aggr = childGroupByDesc.getAggregators().get(pos);
+ // Partial aggregation is not done for distincts on the mapper
+ // However, if the data is bucketed/sorted on the distinct key, partial aggregation
+ // can be performed on the mapper.
+ if (aggr.getDistinct()) {
+ ArrayList<ExprNodeDesc> parameters = new ArrayList<ExprNodeDesc>();
+ ExprNodeDesc param = aggr.getParameters().get(0);
+ assert param instanceof ExprNodeColumnDesc;
+ ExprNodeColumnDesc paramC = (ExprNodeColumnDesc) param;
+ paramC.setIsPartitionColOrVirtualCol(false);
+ paramC.setColumn("VALUE._col" + pos);
+ parameters.add(paramC);
+ aggr.setParameters(parameters);
+ aggr.setDistinct(false);
+ aggr.setMode(Mode.FINAL);
+ }
+ }
+ // Partial aggregation is performed on the mapper, no distinct processing at the reducer
+ childGroupByDesc.setDistinct(false);
+ groupByOpDesc.setDontResetAggrsDistinct(true);
+ groupByOpDesc.setBucketGroup(true);
+ groupByOp.setUseBucketizedHiveInputFormat(true);
+ // no distinct processing at the reducer
+ // A query like 'select count(distinct key) from T' is transformed into
+ // 'select count(key) from T' as far as the reducer is concerned.
+ reduceSinkOp.getConf().setDistinctColumnIndices(new ArrayList<List<Integer>>());
+ }
+ else if (setBucketGroup) {
+ groupByOpDesc.setBucketGroup(true);
}
}
@@ -339,8 +409,8 @@ public class GroupByOptimizer implements
GroupByOptimizerSortMatch currentMatch =
notDeniedPartns.isEmpty() ? GroupByOptimizerSortMatch.NO_MATCH :
- notDeniedPartns.size() > 1 ? GroupByOptimizerSortMatch.PARTIAL_MATCH :
- GroupByOptimizerSortMatch.COMPLETE_MATCH;
+ notDeniedPartns.size() > 1 ? GroupByOptimizerSortMatch.PARTIAL_MATCH :
+ GroupByOptimizerSortMatch.COMPLETE_MATCH;
for (Partition part : notDeniedPartns) {
List<String> sortCols = part.getSortColNames();
List<String> bucketCols = part.getBucketCols();
@@ -440,8 +510,9 @@ public class GroupByOptimizer implements
case NO_MATCH:
return GroupByOptimizerSortMatch.NO_MATCH;
case COMPLETE_MATCH:
- return ((bucketCols != null) && !bucketCols.isEmpty() && sortCols.containsAll(bucketCols)) ?
- GroupByOptimizerSortMatch.COMPLETE_MATCH : GroupByOptimizerSortMatch.PARTIAL_MATCH;
+ return ((bucketCols != null) && !bucketCols.isEmpty() &&
+ sortCols.containsAll(bucketCols)) ?
+ GroupByOptimizerSortMatch.COMPLETE_MATCH : GroupByOptimizerSortMatch.PARTIAL_MATCH;
case PREFIX_COL1_MATCH:
return GroupByOptimizerSortMatch.NO_MATCH;
case PREFIX_COL2_MATCH:
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java?rev=1470182&r1=1470181&r2=1470182&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java Sat Apr 20 16:23:25 2013
@@ -35,13 +35,13 @@ public class GroupByDesc extends Abstrac
* PARTIAL1: partial aggregation - first phase: iterate, terminatePartial
* PARTIAL2: partial aggregation - second phase: merge, terminatePartial
* PARTIALS: For non-distinct the same as PARTIAL2, for distinct the same as
- * PARTIAL1
+ * PARTIAL1
* FINAL: partial aggregation - final phase: merge, terminate
* HASH: For non-distinct the same as PARTIAL1 but use hash-table-based aggregation
* MERGEPARTIAL: FINAL for non-distinct aggregations, COMPLETE for distinct
* aggregations.
*/
- private static final long serialVersionUID = 1L;
+ private static long serialVersionUID = 1L;
/**
* Mode.
@@ -66,6 +66,7 @@ public class GroupByDesc extends Abstrac
private float groupByMemoryUsage;
private float memoryThreshold;
transient private boolean isDistinct;
+ private boolean dontResetAggrsDistinct;
public GroupByDesc() {
}
@@ -83,8 +84,8 @@ public class GroupByDesc extends Abstrac
final int groupingSetsPosition,
final boolean isDistinct) {
this(mode, outputColumnNames, keys, aggregators, groupKeyNotReductionKey,
- false, groupByMemoryUsage, memoryThreshold, listGroupingSets,
- groupingSetsPresent, groupingSetsPosition, isDistinct);
+ false, groupByMemoryUsage, memoryThreshold, listGroupingSets,
+ groupingSetsPresent, groupingSetsPosition, isDistinct);
}
public GroupByDesc(
@@ -212,11 +213,11 @@ public class GroupByDesc extends Abstrac
*/
public boolean isDistinctLike() {
ArrayList<AggregationDesc> aggregators = getAggregators();
- for(AggregationDesc ad: aggregators){
- if(!ad.getDistinct()) {
+ for (AggregationDesc ad : aggregators) {
+ if (!ad.getDistinct()) {
GenericUDAFEvaluator udafEval = ad.getGenericUDAFEvaluator();
UDFType annot = udafEval.getClass().getAnnotation(UDFType.class);
- if(annot == null || !annot.distinctLike()) {
+ if (annot == null || !annot.distinctLike()) {
return false;
}
}
@@ -257,4 +258,16 @@ public class GroupByDesc extends Abstrac
public boolean isDistinct() {
return isDistinct;
}
+
+ public void setDistinct(boolean isDistinct) {
+ this.isDistinct = isDistinct;
+ }
+
+ public boolean isDontResetAggrsDistinct() {
+ return dontResetAggrsDistinct;
+ }
+
+ public void setDontResetAggrsDistinct(boolean dontResetAggrsDistinct) {
+ this.dontResetAggrsDistinct = dontResetAggrsDistinct;
+ }
}
Added: hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_11.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_11.q?rev=1470182&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_11.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_11.q Sat Apr 20 16:23:25 2013
@@ -0,0 +1,40 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 1;
+set hive.map.groupby.sorted=true;
+
+CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 PARTITION (ds='1')
+SELECT * from src where key < 10;
+
+-- The plan is optimized to perform partial aggregation on the mapper
+EXPLAIN select count(distinct key) from T1;
+select count(distinct key) from T1;
+
+-- The plan is optimized to perform partial aggregation on the mapper
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1;
+select count(distinct key), count(1), count(key), sum(distinct key) from T1;
+
+-- The plan is not changed in the presence of a grouping key
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+
+-- The plan is not changed in the presence of a grouping key
+EXPLAIN select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+
+-- The plan is not changed in the presence of a grouping key expression
+EXPLAIN select count(distinct key+key) from T1;
+select count(distinct key+key) from T1;
+
+EXPLAIN select count(distinct 1) from T1;
+select count(distinct 1) from T1;
+
+set hive.map.aggr=false;
+
+-- no plan change if map aggr is turned off
+EXPLAIN select count(distinct key) from T1;
+select count(distinct key) from T1;
Modified: hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_8.q
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_8.q?rev=1470182&r1=1470181&r2=1470182&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_8.q (original)
+++ hive/trunk/ql/src/test/queries/clientpositive/groupby_sort_8.q Sat Apr 20 16:23:25 2013
@@ -12,9 +12,16 @@ LOAD DATA LOCAL INPATH '../data/files/T1
INSERT OVERWRITE TABLE T1 PARTITION (ds='1') select key, val from T1 where ds = '1';
-- The plan is not converted to a map-side, since although the sorting columns and grouping
--- columns match, the user is issueing a distinct
+-- columns match, the user is issuing a distinct.
+-- However, after HIVE-4310, partial aggregation is performed on the mapper
EXPLAIN
select count(distinct key) from T1;
select count(distinct key) from T1;
-DROP TABLE T1;
\ No newline at end of file
+set hive.map.groupby.sorted.testmode=true;
+-- In testmode, the plan is not changed
+EXPLAIN
+select count(distinct key) from T1;
+select count(distinct key) from T1;
+
+DROP TABLE T1;
Added: hive/trunk/ql/src/test/results/clientpositive/groupby_sort_11.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/groupby_sort_11.q.out?rev=1470182&view=auto
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/groupby_sort_11.q.out (added)
+++ hive/trunk/ql/src/test/results/clientpositive/groupby_sort_11.q.out Sat Apr 20 16:23:25 2013
@@ -0,0 +1,655 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 PARTITION (ds='1')
+SELECT * from src where key < 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@t1@ds=1
+POSTHOOK: query: -- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 PARTITION (ds='1')
+SELECT * from src where key < 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@t1@ds=1
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: -- The plan is optimized to perform partial aggregation on the mapper
+EXPLAIN select count(distinct key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan is optimized to perform partial aggregation on the mapper
+EXPLAIN select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT key)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(distinct key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6
+PREHOOK: query: -- The plan is optimized to perform partial aggregation on the mapper
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan is optimized to perform partial aggregation on the mapper
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT key)
+ expr: count(1)
+ expr: count(key)
+ expr: sum(DISTINCT key)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: bigint
+ expr: _col4
+ type: double
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ expr: count(VALUE._col1)
+ expr: count(VALUE._col2)
+ expr: sum(VALUE._col3)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: double
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6 10 10 28.0
+PREHOOK: query: -- The plan is not changed in the presence of a grouping key
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan is not changed in the presence of a grouping key
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_TABLE_OR_COL key)))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT key)
+ expr: count(1)
+ expr: count(key)
+ expr: sum(DISTINCT key)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: bigint
+ expr: _col4
+ type: double
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT KEY._col1:0._col0)
+ expr: count(VALUE._col1)
+ expr: count(VALUE._col2)
+ expr: sum(DISTINCT KEY._col1:1._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col1:1._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: bigint
+ expr: _col4
+ type: double
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+1 3 3 0.0
+1 1 1 2.0
+1 1 1 4.0
+1 3 3 5.0
+1 1 1 8.0
+1 1 1 9.0
+PREHOOK: query: -- The plan is not changed in the presence of a grouping key
+EXPLAIN select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan is not changed in the presence of a grouping key
+EXPLAIN select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_TABLE_OR_COL key)))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT key)
+ expr: count(1)
+ expr: count(key)
+ expr: sum(DISTINCT key)
+ bucketGroup: true
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: bigint
+ expr: _col4
+ type: double
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT KEY._col1:0._col0)
+ expr: count(VALUE._col1)
+ expr: count(VALUE._col2)
+ expr: sum(DISTINCT KEY._col1:1._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col1:1._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: bigint
+ expr: _col3
+ type: bigint
+ expr: _col4
+ type: double
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0 1 3 3 0.0
+2 1 1 1 2.0
+4 1 1 1 4.0
+5 1 3 3 5.0
+8 1 1 1 8.0
+9 1 1 1 9.0
+PREHOOK: query: -- The plan is not changed in the presence of a grouping key expression
+EXPLAIN select count(distinct key+key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The plan is not changed in the presence of a grouping key expression
+EXPLAIN select count(distinct key+key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT (key + key))
+ bucketGroup: false
+ keys:
+ expr: (key + key)
+ type: double
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: double
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT KEY._col0:0._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(distinct key+key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key+key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6
+PREHOOK: query: EXPLAIN select count(distinct 1) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(distinct 1) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count 1)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT 1)
+ bucketGroup: false
+ keys:
+ expr: 1
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT KEY._col0:0._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(distinct 1) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct 1) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+1
+PREHOOK: query: -- no plan change if map aggr is turned off
+EXPLAIN select count(distinct key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no plan change if map aggr is turned off
+EXPLAIN select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT KEY._col0:0._col0)
+ bucketGroup: false
+ mode: complete
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(distinct key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6
Modified: hive/trunk/ql/src/test/results/clientpositive/groupby_sort_8.q.out
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/clientpositive/groupby_sort_8.q.out?rev=1470182&r1=1470181&r2=1470182&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/clientpositive/groupby_sort_8.q.out (original)
+++ hive/trunk/ql/src/test/results/clientpositive/groupby_sort_8.q.out Sat Apr 20 16:23:25 2013
@@ -27,12 +27,14 @@ POSTHOOK: Output: default@t1@ds=1
POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
PREHOOK: query: -- The plan is not converted to a map-side, since although the sorting columns and grouping
--- columns match, the user is issueing a distinct
+-- columns match, the user is issueing a distinct.
+-- However, after HIVE-4310, partial aggregation is performed on the mapper
EXPLAIN
select count(distinct key) from T1
PREHOOK: type: QUERY
POSTHOOK: query: -- The plan is not converted to a map-side, since although the sorting columns and grouping
--- columns match, the user is issueing a distinct
+-- columns match, the user is issueing a distinct.
+-- However, after HIVE-4310, partial aggregation is performed on the mapper
EXPLAIN
select count(distinct key) from T1
POSTHOOK: type: QUERY
@@ -78,6 +80,90 @@ STAGE PLANS:
Reduce Operator Tree:
Group By Operator
aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select count(distinct key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+5
+PREHOOK: query: -- In testmode, the plan is not changed
+EXPLAIN
+select count(distinct key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- In testmode, the plan is not changed
+EXPLAIN
+select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(t1)t1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(DISTINCT key)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
expr: count(DISTINCT KEY._col0:0._col0)
bucketGroup: false
mode: mergepartial
Modified: hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml?rev=1470182&r1=1470181&r2=1470182&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml (original)
+++ hive/trunk/ql/src/test/results/compiler/plan/groupby2.q.xml Sat Apr 20 16:23:25 2013
@@ -693,6 +693,9 @@
</void>
</object>
</void>
+ <void property="distinct">
+ <boolean>true</boolean>
+ </void>
<void property="groupByMemoryUsage">
<float>0.5</float>
</void>
@@ -1711,6 +1714,9 @@
</void>
</object>
</void>
+ <void property="distinct">
+ <boolean>true</boolean>
+ </void>
<void property="groupByMemoryUsage">
<float>0.5</float>
</void>
Modified: hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml?rev=1470182&r1=1470181&r2=1470182&view=diff
==============================================================================
--- hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml (original)
+++ hive/trunk/ql/src/test/results/compiler/plan/groupby3.q.xml Sat Apr 20 16:23:25 2013
@@ -888,6 +888,9 @@
</void>
</object>
</void>
+ <void property="distinct">
+ <boolean>true</boolean>
+ </void>
<void property="groupByMemoryUsage">
<float>0.5</float>
</void>
@@ -2034,6 +2037,9 @@
</void>
</object>
</void>
+ <void property="distinct">
+ <boolean>true</boolean>
+ </void>
<void property="groupByMemoryUsage">
<float>0.5</float>
</void>