Posted to commits@hive.apache.org by px...@apache.org on 2016/11/17 18:48:26 UTC
[2/2] hive git commit: HIVE-10901: Optimize multi column distinct queries (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/67c022f0
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/67c022f0
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/67c022f0
Branch: refs/heads/master
Commit: 67c022f05f5405abd996446fefc90e7ae34c960d
Parents: 0025b9d
Author: Pengcheng Xiong <px...@apache.org>
Authored: Thu Nov 17 10:48:06 2016 -0800
Committer: Pengcheng Xiong <px...@apache.org>
Committed: Thu Nov 17 10:48:06 2016 -0800
----------------------------------------------------------------------
.../test/resources/testconfiguration.properties | 1 +
.../rules/HiveAggregateProjectMergeRule.java | 14 ++
.../rules/HiveExpandDistinctAggregatesRule.java | 206 ++++++++++++++++++-
.../clientpositive/multi_count_distinct.q | 38 ++++
.../llap/vector_grouping_sets.q.out | 8 +-
.../results/clientpositive/perf/query70.q.out | 52 ++---
.../tez/multi_count_distinct.q.out | 193 +++++++++++++++++
.../clientpositive/vector_grouping_sets.q.out | 8 +-
8 files changed, 483 insertions(+), 37 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index f30152b..e4910e4 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -51,6 +51,7 @@ minitez.query.files=explainuser_3.q,\
explainanalyze_5.q,\
hybridgrace_hashjoin_1.q,\
hybridgrace_hashjoin_2.q,\
+ multi_count_distinct.q,\
tez_union_with_udf.q
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
index c243266..7c3b4b0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateProjectMergeRule.java
@@ -31,6 +31,7 @@ import org.apache.calcite.util.ImmutableBitSet;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
import com.google.common.collect.ImmutableList;
@@ -59,6 +60,19 @@ public class HiveAggregateProjectMergeRule extends RelOptRule {
}
@Override
+ public boolean matches(RelOptRuleCall call) {
+ final Aggregate aggregate = call.rel(0);
+ // The rule cannot be applied when a GroupingID aggregate call is present:
+ // merging the Project would renumber the columns and change its value.
+ for (AggregateCall aggCall : aggregate.getAggCallList()) {
+ if (aggCall.getAggregation().equals(HiveGroupingID.INSTANCE)) {
+ return false;
+ }
+ }
+ return super.matches(call);
+ }
+
+ @Override
public void onMatch(RelOptRuleCall call) {
final HiveAggregate aggregate = call.rel(0);
final HiveProject project = call.rel(1);
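Why the new matches() guard above is needed: GROUPING__ID is a bitmask indexed by the positions of the group-by keys, so any rule that renumbers input columns silently changes its value. Here is a minimal standalone sketch of that position dependence (plain Java, illustrative only, not Hive's actual computation; the class name and column numbers are made up):

    import java.util.List;

    public class GroupingIdPositionDemo {

        // GROUPING__ID for one grouping set: bit i is set when the group-by
        // key holding ordinal position i participates in the set.
        static long groupingId(List<Integer> groupingSet, List<Integer> sortedKeys) {
            long id = 0;
            for (int col : groupingSet) {
                id |= 1L << sortedKeys.indexOf(col);
            }
            return id;
        }

        public static void main(String[] args) {
            // Group-by keys at input columns $4, $5, $6; grouping set {$5}.
            System.out.println(groupingId(List.of(5), List.of(4, 5, 6))); // 2

            // After a project merge the keys may be renumbered, e.g. $5 -> $0,
            // $4 -> $1, $6 -> $2. The same logical set {$5} is now {$0}:
            System.out.println(groupingId(List.of(0), List.of(0, 1, 2))); // 1
        }
    }

The same logical grouping set yields 2 before the merge and 1 after it, which is exactly why the rule bails out instead of rewriting the plan.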
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
index 7d4411a..417eda3 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
@@ -16,13 +16,17 @@
*/
package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
+import java.math.BigDecimal;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import org.apache.calcite.plan.RelOptCluster;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.rel.RelNode;
@@ -31,16 +35,30 @@ import org.apache.calcite.rel.core.AggregateCall;
import org.apache.calcite.rel.core.RelFactories;
import org.apache.calcite.rel.metadata.RelColumnOrigin;
import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeField;
+import org.apache.calcite.rex.RexBuilder;
import org.apache.calcite.rex.RexInputRef;
import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.calcite.util.Pair;
import org.apache.calcite.util.Util;
+import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveGroupingID;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveRelNode;
+import org.apache.hadoop.hive.ql.optimizer.calcite.translator.TypeConverter;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
@@ -73,6 +91,8 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
HiveRelFactories.HIVE_PROJECT_FACTORY);
private static RelFactories.ProjectFactory projFactory;
+
+ protected static final Logger LOG = LoggerFactory.getLogger(HiveExpandDistinctAggregatesRule.class);
//~ Constructors -----------------------------------------------------------
@@ -82,19 +102,25 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
projFactory = projectFactory;
}
+ RelOptCluster cluster = null;
+ RexBuilder rexBuilder = null;
+
//~ Methods ----------------------------------------------------------------
@Override
public void onMatch(RelOptRuleCall call) {
final Aggregate aggregate = call.rel(0);
- if (!aggregate.containsDistinctCall()) {
+ int numCountDistinct = getNumCountDistinctCall(aggregate);
+ if (numCountDistinct == 0) {
return;
}
- // Find all of the agg expressions. We use a LinkedHashSet to ensure
- // determinism.
+ // Find all of the agg expressions. We use a List (all count(distinct) calls,
+ // duplicates included) plus a LinkedHashSet (deduplicated) for determinism.
int nonDistinctCount = 0;
+ List<List<Integer>> argListList = new ArrayList<List<Integer>>();
Set<List<Integer>> argListSets = new LinkedHashSet<List<Integer>>();
+ Set<Integer> positions = new HashSet<>();
for (AggregateCall aggCall : aggregate.getAggCallList()) {
if (!aggCall.isDistinct()) {
++nonDistinctCount;
@@ -103,11 +129,35 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
ArrayList<Integer> argList = new ArrayList<Integer>();
for (Integer arg : aggCall.getArgList()) {
argList.add(arg);
+ positions.add(arg);
}
+ // The Aggregate operator checks for a sorted argList.
+ argListList.add(argList);
argListSets.add(argList);
}
Util.permAssert(argListSets.size() > 0, "containsDistinctCall lied");
+ if (numCountDistinct > 1 && numCountDistinct == aggregate.getAggCallList().size()
+ && aggregate.getGroupSet().isEmpty()) {
+ // At this point, positions holds every column referenced by a count(distinct),
+ // e.g., $5, $4, $6. Sort them to form the group-by set, so that each column's
+ // position can be looked up later, e.g., $4->1, $5->2, $6->3.
+ cluster = aggregate.getCluster();
+ rexBuilder = cluster.getRexBuilder();
+ RelNode converted = null;
+ List<Integer> sourceOfForCountDistinct = new ArrayList<>();
+ sourceOfForCountDistinct.addAll(positions);
+ Collections.sort(sourceOfForCountDistinct);
+ try {
+ converted = convert(aggregate, argListList, sourceOfForCountDistinct);
+ } catch (CalciteSemanticException e) {
+ LOG.debug(e.toString());
+ throw new RuntimeException(e);
+ }
+ call.transformTo(converted);
+ return;
+ }
+
// If all of the agg expressions are distinct and have the same
// arguments then we can use a more efficient form.
if ((nonDistinctCount == 0) && (argListSets.size() == 1)) {
@@ -133,6 +183,156 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
}
/**
+ * Converts an aggregate relational expression that contains only
+ * count(distinct) calls into grouping sets with plain count. For example,
+ *
+ *   select count(distinct department_id), count(distinct gender),
+ *          count(distinct education_level) from employee;
+ *
+ * can be transformed into
+ *
+ *   select count(case i when 1 then 1 else null end) as c0,
+ *          count(case i when 2 then 1 else null end) as c1,
+ *          count(case i when 4 then 1 else null end) as c2
+ *   from (select grouping__id as i, department_id, gender, education_level
+ *         from employee
+ *         group by department_id, gender, education_level
+ *         grouping sets (department_id, gender, education_level)) subq;
+ *
+ * @throws CalciteSemanticException
+ private RelNode convert(Aggregate aggregate, List<List<Integer>> argList, List<Integer> sourceOfForCountDistinct) throws CalciteSemanticException {
+ // we use this map to map the position of argList to the position of grouping set
+ Map<Integer, Integer> map = new HashMap<>();
+ List<List<Integer>> cleanArgList = new ArrayList<>();
+ final Aggregate groupingSets = createGroupingSets(aggregate, argList, cleanArgList, map, sourceOfForCountDistinct);
+ return createCount(groupingSets, argList, cleanArgList, map, sourceOfForCountDistinct);
+ }
+
+ private long getGroupingIdValue(List<Integer> list, List<Integer> sourceOfForCountDistinct) {
+ long ind = 0;
+ for (int i : list) {
+ // use a long shift: an int literal would overflow once there are more
+ // than 31 columns in the group set
+ ind |= 1L << sourceOfForCountDistinct.indexOf(i);
+ }
+ return ind;
+ }
+
+ /**
+ * @param aggr the original aggregate
+ * @param argList the original argList of the aggregate
+ * @param cleanArgList the new argList without duplicates
+ * @param map the mapping from the original argList to the new argList
+ * @param sourceOfForCountDistinct the sorted positions of the group set
+ * @return the aggregate that computes the counts over the grouping sets
+ * @throws CalciteSemanticException
+ */
+ private RelNode createCount(Aggregate aggr, List<List<Integer>> argList,
+ List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
+ List<Integer> sourceOfForCountDistinct) throws CalciteSemanticException {
+ List<RexNode> originalInputRefs = Lists.transform(aggr.getRowType().getFieldList(),
+ new Function<RelDataTypeField, RexNode>() {
+ @Override
+ public RexNode apply(RelDataTypeField input) {
+ return new RexInputRef(input.getIndex(), input.getType());
+ }
+ });
+ final List<RexNode> gbChildProjLst = Lists.newArrayList();
+ for (List<Integer> list : cleanArgList) {
+ RexNode equal = rexBuilder.makeCall(SqlStdOperatorTable.EQUALS,
+ originalInputRefs.get(originalInputRefs.size() - 1),
+ rexBuilder.makeExactLiteral(new BigDecimal(getGroupingIdValue(list, sourceOfForCountDistinct))));
+ RexNode condition = rexBuilder.makeCall(SqlStdOperatorTable.CASE, equal,
+ rexBuilder.makeExactLiteral(BigDecimal.ONE), rexBuilder.constantNull());
+ gbChildProjLst.add(condition);
+ }
+
+ // create the project before GB
+ RelNode gbInputRel = HiveProject.create(aggr, gbChildProjLst, null);
+
+ // create the aggregate
+ List<AggregateCall> aggregateCalls = Lists.newArrayList();
+ RelDataType aggFnRetType = TypeConverter.convert(TypeInfoFactory.longTypeInfo,
+ cluster.getTypeFactory());
+ for (int i = 0; i < cleanArgList.size(); i++) {
+ AggregateCall aggregateCall = HiveCalciteUtil.createSingleArgAggCall("count", cluster,
+ TypeInfoFactory.longTypeInfo, i, aggFnRetType);
+ aggregateCalls.add(aggregateCall);
+ }
+ Aggregate aggregate = new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), gbInputRel,
+ false, ImmutableBitSet.of(), null, aggregateCalls);
+
+ // create the project after GB. For those repeated values, e.g., select
+ // count(distinct x, y), count(distinct y, x), we find the correct mapping.
+ if (map.isEmpty()) {
+ return aggregate;
+ } else {
+ List<RexNode> originalAggrRefs = Lists.transform(aggregate.getRowType().getFieldList(),
+ new Function<RelDataTypeField, RexNode>() {
+ @Override
+ public RexNode apply(RelDataTypeField input) {
+ return new RexInputRef(input.getIndex(), input.getType());
+ }
+ });
+ final List<RexNode> projLst = Lists.newArrayList();
+ int index = 0;
+ for (int i = 0; i < argList.size(); i++) {
+ if (map.containsKey(i)) {
+ projLst.add(originalAggrRefs.get(map.get(i)));
+ } else {
+ projLst.add(originalAggrRefs.get(index++));
+ }
+ }
+ return HiveProject.create(aggregate, projLst, null);
+ }
+ }
+
+ /**
+ * @param aggregate the original aggregate
+ * @param argList the original argList of the aggregate
+ * @param cleanArgList the new argList without duplicates
+ * @param map the mapping from the original argList to the new argList
+ * @param sourceOfForCountDistinct the sorted positions of the group set
+ * @return the aggregate over the grouping sets, emitting the GroupingID column
+ */
+ private Aggregate createGroupingSets(Aggregate aggregate, List<List<Integer>> argList,
+ List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
+ List<Integer> sourceOfForCountDistinct) {
+ final ImmutableBitSet groupSet = ImmutableBitSet.of(sourceOfForCountDistinct);
+ final List<ImmutableBitSet> origGroupSets = new ArrayList<>();
+
+ for (int i = 0; i < argList.size(); i++) {
+ List<Integer> list = argList.get(i);
+ ImmutableBitSet bitSet = ImmutableBitSet.of(list);
+ int prev = origGroupSets.indexOf(bitSet);
+ if (prev == -1) {
+ origGroupSets.add(bitSet);
+ cleanArgList.add(list);
+ } else {
+ map.put(i, prev);
+ }
+ }
+ // Calcite expects the grouping sets sorted and without duplicates
+ Collections.sort(origGroupSets, ImmutableBitSet.COMPARATOR);
+
+ List<AggregateCall> aggregateCalls = new ArrayList<AggregateCall>();
+ // Create GroupingID column
+ AggregateCall aggCall = AggregateCall.create(HiveGroupingID.INSTANCE, false,
+ new ImmutableList.Builder<Integer>().build(), -1, this.cluster.getTypeFactory()
+ .createSqlType(SqlTypeName.INTEGER), HiveGroupingID.INSTANCE.getName());
+ aggregateCalls.add(aggCall);
+ return new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION),
+ aggregate.getInput(), true, groupSet, origGroupSets, aggregateCalls);
+ }
+
+ /**
+ * Returns the number of count(distinct) calls in the given aggregate.
+ *
+ * @return the number of count(distinct) calls
+ */
+ private int getNumCountDistinctCall(Aggregate hiveAggregate) {
+ int cnt = 0;
+ for (AggregateCall aggCall : hiveAggregate.getAggCallList()) {
+ if (aggCall.isDistinct() && (aggCall.getAggregation().getName().equalsIgnoreCase("count"))) {
+ cnt++;
+ }
+ }
+ return cnt;
+ }
+
+ /**
* Converts an aggregate relational expression that contains just one
* distinct aggregate function (or perhaps several over the same arguments)
* and no non-distinct aggregate functions.
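For a concrete picture of the plan this rule now produces, here is a minimal standalone simulation in plain Java (illustrative only, not Hive code; the class and helper names are made up): every input row is expanded once per grouping set, the inner group-by keeps one row per distinct (grouping id, key) combination, and the outer aggregate counts rows per grouping id, mirroring the count(case ...) projection built by createCount(). The row data is the nine-row employee table from the new multi_count_distinct.q test below.

    import java.util.ArrayList;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    public class MultiCountDistinctSim {

        public static void main(String[] args) {
            // employee(department_id, gender, education_level), the nine rows
            // inserted by the new multi_count_distinct.q test.
            int[][] rows = { {1, 'M', 1}, {1, 'M', 1}, {2, 'F', 1}, {1, 'F', 3},
                             {1, 'M', 2}, {4, 'M', 1}, {2, 'F', 1}, {2, 'F', 3},
                             {3, 'M', 2} };

            // Grouping sets over columns 0, 1, 2, one per count(distinct);
            // their grouping ids are 1, 2 and 4 (one bit per column).
            int[][] groupingSets = { {0}, {1}, {2} };

            // The inner group-by: one surviving row per (grouping id, keys).
            Set<List<Integer>> grouped = new LinkedHashSet<>();
            for (int[] row : rows) {
                for (int[] gs : groupingSets) {
                    List<Integer> key = new ArrayList<>();
                    long id = 0;
                    for (int col : gs) {
                        id |= 1L << col;
                    }
                    key.add((int) id);
                    for (int col = 0; col < row.length; col++) {
                        key.add(in(gs, col) ? row[col] : null); // null outside the set
                    }
                    grouped.add(key);
                }
            }

            // The outer aggregate: count(case grouping__id when ... then 1 end).
            long c0 = 0, c1 = 0, c2 = 0;
            for (List<Integer> key : grouped) {
                if (key.get(0) == 1) c0++;
                if (key.get(0) == 2) c1++;
                if (key.get(0) == 4) c2++;
            }
            System.out.println(c0 + " " + c1 + " " + c2); // prints: 4 2 3
        }

        static boolean in(int[] a, int v) {
            for (int x : a) if (x == v) return true;
            return false;
        }
    }

The printed "4 2 3" matches the golden output of the test query, and the whole computation needs only one pass over the data plus one group-by, which is the point of the rewrite.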
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/queries/clientpositive/multi_count_distinct.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/multi_count_distinct.q b/ql/src/test/queries/clientpositive/multi_count_distinct.q
new file mode 100644
index 0000000..855cb64
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/multi_count_distinct.q
@@ -0,0 +1,38 @@
+set hive.mapred.mode=nonstrict;
+
+drop table employee;
+
+create table employee (department_id int, gender varchar(10), education_level int);
+
+insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2);
+
+explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee;
+
+select count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id) from employee;
+
+select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level) from employee;
+
+explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee;
+
+select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee;
+
+select count(case i when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null end) as c1,
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id, gender,
+education_level from employee group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level))subq;
+
+select grouping__id as i, department_id, gender, education_level from employee
+group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level, (education_level, department_id));
+
+
+
+
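The literals 1, 2 and 4 in the count(case ...) query above, and the 5 that appears for the composite set in the last query, are simply the bitmask values of the grouping sets (department_id = bit 0, gender = bit 1, education_level = bit 2, per the Javadoc example in HiveExpandDistinctAggregatesRule). A quick check in plain Java (illustrative only):

    public class GroupingIdValues {
        public static void main(String[] args) {
            System.out.println(1L << 0);               // (department_id)   -> 1
            System.out.println(1L << 1);               // (gender)          -> 2
            System.out.println(1L << 2);               // (education_level) -> 4
            System.out.println((1L << 0) | (1L << 2)); // (education_level, department_id) -> 5
        }
    }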
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out b/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
index 2cd758c..127478e 100644
--- a/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_grouping_sets.q.out
@@ -241,10 +241,10 @@ STAGE PLANS:
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: s_store_id (type: string)
- outputColumnNames: s_store_id
+ outputColumnNames: _col0
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- keys: s_store_id (type: string), '0' (type: string)
+ keys: _col0 (type: string), '0' (type: string)
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE
@@ -329,10 +329,10 @@ STAGE PLANS:
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: s_store_id (type: string)
- outputColumnNames: s_store_id
+ outputColumnNames: _col0
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- keys: s_store_id (type: string), '0' (type: string)
+ keys: _col0 (type: string), '0' (type: string)
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/perf/query70.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/perf/query70.q.out b/ql/src/test/results/clientpositive/perf/query70.q.out
index 5a82cd1..611af74 100644
--- a/ql/src/test/results/clientpositive/perf/query70.q.out
+++ b/ql/src/test/results/clientpositive/perf/query70.q.out
@@ -21,46 +21,46 @@ Stage-0
limit:100
Stage-1
Reducer 6
- File Output Operator [FS_62]
- Limit [LIM_61] (rows=100 width=88)
+ File Output Operator [FS_63]
+ Limit [LIM_62] (rows=100 width=88)
Number of rows:100
- Select Operator [SEL_60] (rows=1045432122 width=88)
+ Select Operator [SEL_61] (rows=1045432122 width=88)
Output:["_col0","_col1","_col2","_col3","_col4"]
<-Reducer 5 [SIMPLE_EDGE]
- SHUFFLE [RS_59]
- Select Operator [SEL_57] (rows=1045432122 width=88)
+ SHUFFLE [RS_60]
+ Select Operator [SEL_58] (rows=1045432122 width=88)
Output:["_col0","_col1","_col2","_col3","_col4"]
- PTF Operator [PTF_56] (rows=1045432122 width=88)
+ PTF Operator [PTF_57] (rows=1045432122 width=88)
Function definitions:[{},{"name:":"windowingtablefunction","order by:":"_col4 ASC NULLS FIRST","partition by:":"_col5, CASE WHEN ((_col5 = 2)) THEN (_col0) ELSE (null) END"}]
- Select Operator [SEL_55] (rows=1045432122 width=88)
+ Select Operator [SEL_56] (rows=1045432122 width=88)
Output:["_col0","_col1","_col4","_col5"]
<-Reducer 4 [SIMPLE_EDGE]
- SHUFFLE [RS_54]
+ SHUFFLE [RS_55]
PartitionCols:_col5, CASE WHEN ((_col5 = 2)) THEN (_col0) ELSE (null) END
- Select Operator [SEL_53] (rows=1045432122 width=88)
+ Select Operator [SEL_54] (rows=1045432122 width=88)
Output:["_col0","_col1","_col4","_col5"]
- Group By Operator [GBY_52] (rows=1045432122 width=88)
+ Group By Operator [GBY_53] (rows=1045432122 width=88)
Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0, KEY._col1, KEY._col2
<-Reducer 3 [SIMPLE_EDGE]
- SHUFFLE [RS_51]
+ SHUFFLE [RS_52]
PartitionCols:_col0, _col1, _col2
- Group By Operator [GBY_50] (rows=2090864244 width=88)
- Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(_col2)"],keys:_col6, _col7, '0'
+ Group By Operator [GBY_51] (rows=2090864244 width=88)
+ Output:["_col0","_col1","_col2","_col3"],aggregations:["sum(_col2)"],keys:_col0, _col1, '0'
Select Operator [SEL_49] (rows=696954748 width=88)
- Output:["_col6","_col7","_col2"]
- Merge Join Operator [MERGEJOIN_91] (rows=696954748 width=88)
+ Output:["_col0","_col1","_col2"]
+ Merge Join Operator [MERGEJOIN_92] (rows=696954748 width=88)
Conds:RS_46._col1=RS_47._col0(Inner),Output:["_col2","_col6","_col7"]
<-Reducer 2 [SIMPLE_EDGE]
SHUFFLE [RS_46]
PartitionCols:_col1
- Merge Join Operator [MERGEJOIN_87] (rows=633595212 width=88)
+ Merge Join Operator [MERGEJOIN_88] (rows=633595212 width=88)
Conds:RS_43._col0=RS_44._col0(Inner),Output:["_col1","_col2"]
<-Map 1 [SIMPLE_EDGE]
SHUFFLE [RS_43]
PartitionCols:_col0
Select Operator [SEL_2] (rows=575995635 width=88)
Output:["_col0","_col1","_col2"]
- Filter Operator [FIL_80] (rows=575995635 width=88)
+ Filter Operator [FIL_81] (rows=575995635 width=88)
predicate:(ss_sold_date_sk is not null and ss_store_sk is not null)
TableScan [TS_0] (rows=575995635 width=88)
default@store_sales,ss,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_store_sk","ss_net_profit"]
@@ -69,21 +69,21 @@ Stage-0
PartitionCols:_col0
Select Operator [SEL_5] (rows=8116 width=1119)
Output:["_col0"]
- Filter Operator [FIL_81] (rows=8116 width=1119)
+ Filter Operator [FIL_82] (rows=8116 width=1119)
predicate:(d_month_seq BETWEEN 1193 AND 1204 and d_date_sk is not null)
TableScan [TS_3] (rows=73049 width=1119)
default@date_dim,d1,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_month_seq"]
<-Reducer 9 [SIMPLE_EDGE]
SHUFFLE [RS_47]
PartitionCols:_col0
- Merge Join Operator [MERGEJOIN_90] (rows=127775039 width=88)
+ Merge Join Operator [MERGEJOIN_91] (rows=127775039 width=88)
Conds:RS_39._col2=RS_40._col0(Left Semi),Output:["_col0","_col1","_col2"]
<-Map 8 [SIMPLE_EDGE]
SHUFFLE [RS_39]
PartitionCols:_col2
Select Operator [SEL_8] (rows=1704 width=1910)
Output:["_col0","_col1","_col2"]
- Filter Operator [FIL_82] (rows=1704 width=1910)
+ Filter Operator [FIL_83] (rows=1704 width=1910)
predicate:(s_store_sk is not null and s_state is not null)
TableScan [TS_6] (rows=1704 width=1910)
default@store,s,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_county","s_state"]
@@ -94,7 +94,7 @@ Stage-0
Output:["_col0"],keys:_col0
Select Operator [SEL_32] (rows=116159124 width=88)
Output:["_col0"]
- Filter Operator [FIL_83] (rows=116159124 width=88)
+ Filter Operator [FIL_84] (rows=116159124 width=88)
predicate:(rank_window_0 <= 5)
PTF Operator [PTF_31] (rows=348477374 width=88)
Function definitions:[{},{"name:":"windowingtablefunction","order by:":"_col1 DESC NULLS LAST","partition by:":"_col0"}]
@@ -112,28 +112,28 @@ Stage-0
Output:["_col0","_col1"],aggregations:["sum(_col2)"],keys:_col6
Select Operator [SEL_24] (rows=696954748 width=88)
Output:["_col6","_col2"]
- Merge Join Operator [MERGEJOIN_89] (rows=696954748 width=88)
+ Merge Join Operator [MERGEJOIN_90] (rows=696954748 width=88)
Conds:RS_21._col1=RS_22._col0(Inner),Output:["_col2","_col6"]
<-Map 16 [SIMPLE_EDGE]
SHUFFLE [RS_22]
PartitionCols:_col0
Select Operator [SEL_17] (rows=1704 width=1910)
Output:["_col0","_col1"]
- Filter Operator [FIL_86] (rows=1704 width=1910)
+ Filter Operator [FIL_87] (rows=1704 width=1910)
predicate:(s_store_sk is not null and s_state is not null)
TableScan [TS_15] (rows=1704 width=1910)
default@store,store,Tbl:COMPLETE,Col:NONE,Output:["s_store_sk","s_state"]
<-Reducer 11 [SIMPLE_EDGE]
SHUFFLE [RS_21]
PartitionCols:_col1
- Merge Join Operator [MERGEJOIN_88] (rows=633595212 width=88)
+ Merge Join Operator [MERGEJOIN_89] (rows=633595212 width=88)
Conds:RS_18._col0=RS_19._col0(Inner),Output:["_col1","_col2"]
<-Map 10 [SIMPLE_EDGE]
SHUFFLE [RS_18]
PartitionCols:_col0
Select Operator [SEL_11] (rows=575995635 width=88)
Output:["_col0","_col1","_col2"]
- Filter Operator [FIL_84] (rows=575995635 width=88)
+ Filter Operator [FIL_85] (rows=575995635 width=88)
predicate:(ss_store_sk is not null and ss_sold_date_sk is not null)
TableScan [TS_9] (rows=575995635 width=88)
default@store_sales,store_sales,Tbl:COMPLETE,Col:NONE,Output:["ss_sold_date_sk","ss_store_sk","ss_net_profit"]
@@ -142,7 +142,7 @@ Stage-0
PartitionCols:_col0
Select Operator [SEL_14] (rows=8116 width=1119)
Output:["_col0"]
- Filter Operator [FIL_85] (rows=8116 width=1119)
+ Filter Operator [FIL_86] (rows=8116 width=1119)
predicate:(d_month_seq BETWEEN 1193 AND 1204 and d_date_sk is not null)
TableScan [TS_12] (rows=73049 width=1119)
default@date_dim,date_dim,Tbl:COMPLETE,Col:NONE,Output:["d_date_sk","d_month_seq"]
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out b/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
new file mode 100644
index 0000000..b623f1d
--- /dev/null
+++ b/ql/src/test/results/clientpositive/tez/multi_count_distinct.q.out
@@ -0,0 +1,193 @@
+PREHOOK: query: drop table employee
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table employee
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table employee (department_id int, gender varchar(10), education_level int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@employee
+POSTHOOK: query: create table employee (department_id int, gender varchar(10), education_level int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@employee
+PREHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@employee
+POSTHOOK: query: insert into employee values (1, 'M', 1),(1, 'M', 1),(2, 'F', 1),(1, 'F', 3),(1, 'M', 2),(4, 'M', 1),(2, 'F', 1),(2, 'F', 3),(3, 'M', 2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@employee
+POSTHOOK: Lineage: employee.department_id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+POSTHOOK: Lineage: employee.education_level EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ]
+POSTHOOK: Lineage: employee.gender EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ]
+PREHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 3
+ File Output Operator [FS_12]
+ Group By Operator [GBY_10] (rows=1 width=24)
+ Output:["_col0","_col1","_col2"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)"]
+ <-Reducer 2 [SIMPLE_EDGE]
+ SHUFFLE [RS_9]
+ Group By Operator [GBY_8] (rows=1 width=24)
+ Output:["_col0","_col1","_col2"],aggregations:["count(_col0)","count(_col1)","count(_col2)"]
+ Select Operator [SEL_6] (rows=13 width=5)
+ Output:["_col0","_col1","_col2"]
+ Group By Operator [GBY_5] (rows=13 width=5)
+ Output:["_col0","_col1","_col2","_col3"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3
+ <-Map 1 [SIMPLE_EDGE]
+ SHUFFLE [RS_4]
+ PartitionCols:_col0, _col1, _col2, _col3
+ Group By Operator [GBY_3] (rows=27 width=5)
+ Output:["_col0","_col1","_col2","_col3"],keys:_col0, _col1, _col2, '0'
+ Select Operator [SEL_1] (rows=9 width=5)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=5)
+ default@employee,employee,Tbl:COMPLETE,Col:NONE,Output:["department_id","gender","education_level"]
+
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4 2 3
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level), count(distinct education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4 2 3 3
+PREHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4 2 3 7
+PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+2 4 2 3 7 7
+PREHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+POSTHOOK: type: QUERY
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 3
+ File Output Operator [FS_12]
+ Select Operator [SEL_11] (rows=1 width=40)
+ Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"]
+ Group By Operator [GBY_10] (rows=1 width=40)
+ Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(VALUE._col0)","count(VALUE._col1)","count(VALUE._col2)","count(VALUE._col3)","count(VALUE._col4)"]
+ <-Reducer 2 [SIMPLE_EDGE]
+ SHUFFLE [RS_9]
+ Group By Operator [GBY_8] (rows=1 width=40)
+ Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(_col0)","count(_col1)","count(_col2)","count(_col3)","count(_col4)"]
+ Select Operator [SEL_6] (rows=22 width=5)
+ Output:["_col0","_col1","_col2","_col3","_col4"]
+ Group By Operator [GBY_5] (rows=22 width=5)
+ Output:["_col0","_col1","_col2","_col3"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3
+ <-Map 1 [SIMPLE_EDGE]
+ SHUFFLE [RS_4]
+ PartitionCols:_col0, _col1, _col2, _col3
+ Group By Operator [GBY_3] (rows=45 width=5)
+ Output:["_col0","_col1","_col2","_col3"],keys:_col0, _col1, _col2, '0'
+ Select Operator [SEL_1] (rows=9 width=5)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=5)
+ default@employee,employee,Tbl:COMPLETE,Col:NONE,Output:["gender","department_id","education_level"]
+
+PREHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct gender), count(distinct department_id), count(distinct gender), count(distinct education_level),
+count(distinct education_level, department_id), count(distinct department_id, education_level), count(distinct department_id, education_level, gender) from employee
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+2 4 2 3 7 7 7
+PREHOOK: query: select count(case i when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null end) as c1,
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id, gender,
+education_level from employee group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level))subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select count(case i when 1 then 1 else null end) as c0, count(case i when 2 then 1 else null end) as c1,
+count(case i when 4 then 1 else null end) as c2 from (select grouping__id as i, department_id, gender,
+education_level from employee group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level))subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4 2 3
+PREHOOK: query: select grouping__id as i, department_id, gender, education_level from employee
+group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level, (education_level, department_id))
+PREHOOK: type: QUERY
+PREHOOK: Input: default@employee
+#### A masked pattern was here ####
+POSTHOOK: query: select grouping__id as i, department_id, gender, education_level from employee
+group by department_id, gender, education_level grouping sets
+(department_id, gender, education_level, (education_level, department_id))
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@employee
+#### A masked pattern was here ####
+4 NULL NULL 1
+4 NULL NULL 2
+4 NULL NULL 3
+2 NULL F NULL
+2 NULL M NULL
+1 1 NULL NULL
+5 1 NULL 1
+5 1 NULL 2
+5 1 NULL 3
+1 2 NULL NULL
+5 2 NULL 1
+5 2 NULL 3
+1 3 NULL NULL
+5 3 NULL 2
+1 4 NULL NULL
+5 4 NULL 1
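As a sanity check on the multi-column results above (the 7s in "2 4 2 3 7 7 7"), the distinct (department_id, education_level) pairs among the nine inserted rows can be counted directly in plain Java (illustrative only; the class name is made up):

    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class DistinctPairCheck {
        public static void main(String[] args) {
            // (department_id, education_level) from the nine inserted rows.
            int[][] rows = { {1, 1}, {1, 1}, {2, 1}, {1, 2}, {1, 3},
                             {4, 1}, {2, 1}, {2, 3}, {3, 2} };
            Set<List<Integer>> pairs = new HashSet<>();
            for (int[] r : rows) {
                pairs.add(List.of(r[0], r[1]));
            }
            System.out.println(pairs.size()); // prints: 7
        }
    }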
http://git-wip-us.apache.org/repos/asf/hive/blob/67c022f0/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_grouping_sets.q.out b/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
index 9e35049..aa9ee87 100644
--- a/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
+++ b/ql/src/test/results/clientpositive/vector_grouping_sets.q.out
@@ -225,10 +225,10 @@ STAGE PLANS:
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: s_store_id (type: string)
- outputColumnNames: s_store_id
+ outputColumnNames: _col0
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- keys: s_store_id (type: string), '0' (type: string)
+ keys: _col0 (type: string), '0' (type: string)
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE
@@ -303,10 +303,10 @@ STAGE PLANS:
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: s_store_id (type: string)
- outputColumnNames: s_store_id
+ outputColumnNames: _col0
Statistics: Num rows: 12 Data size: 25632 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- keys: s_store_id (type: string), '0' (type: string)
+ keys: _col0 (type: string), '0' (type: string)
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 24 Data size: 51264 Basic stats: COMPLETE Column stats: NONE