You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2019/11/08 08:09:10 UTC
[hive] branch master updated: HIVE-22448: CBO: Expand the multiple
count distinct with a group-by key (Jesus Camacho Rodriguez,
reviewed by Vineet Garg)
This is an automated email from the ASF dual-hosted git repository.
jcamacho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new e7d2cd2 HIVE-22448: CBO: Expand the multiple count distinct with a group-by key (Jesus Camacho Rodriguez, reviewed by Vineet Garg)
e7d2cd2 is described below
commit e7d2cd2c05abf4ac52393095cf93c74833f640df
Author: Jesus Camacho Rodriguez <jc...@apache.org>
AuthorDate: Fri Nov 1 18:54:48 2019 -0700
HIVE-22448: CBO: Expand the multiple count distinct with a group-by key (Jesus Camacho Rodriguez, reviewed by Vineet Garg)
Close apache/hive#838
---
.../test/resources/testconfiguration.properties | 1 +
.../rules/HiveExpandDistinctAggregatesRule.java | 96 ++++----
.../queries/clientpositive/multigroupbydistinct.q | 60 +++++
.../clientpositive/llap/limit_pushdown.q.out | 57 +++--
.../clientpositive/llap/limit_pushdown3.q.out | 49 ++--
.../clientpositive/llap/multigroupbydistinct.q.out | 271 +++++++++++++++++++++
.../llap/offset_limit_ppd_optimizer.q.out | 55 +++--
.../llap/reduce_deduplicate_distinct.q.out | 254 ++++++++++++-------
.../spark/auto_join18_multi_distinct.q.out | 44 ++--
.../spark/join18_multi_distinct.q.out | 44 ++--
.../clientpositive/spark/limit_pushdown.q.out | 55 +++--
11 files changed, 723 insertions(+), 263 deletions(-)
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index e60c4c5..50dcf40 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -640,6 +640,7 @@ minillaplocal.query.files=\
mm_loaddata.q,\
mm_loaddata_split_change.q,\
mrr.q,\
+ multigroupbydistinct.q,\
multiMapJoin1.q,\
multiMapJoin2.q,\
multi_in_clause.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
index 103d5e1..e8b2c37 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveExpandDistinctAggregatesRule.java
@@ -16,21 +16,22 @@
*/
package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
+import com.google.common.base.Preconditions;
import java.math.BigDecimal;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.stream.Collectors;
import org.apache.calcite.plan.RelOptCluster;
import org.apache.calcite.plan.RelOptRule;
import org.apache.calcite.plan.RelOptRuleCall;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.core.Aggregate;
+import org.apache.calcite.rel.core.Aggregate.Group;
import org.apache.calcite.rel.core.AggregateCall;
import org.apache.calcite.rel.core.RelFactories;
import org.apache.calcite.rel.metadata.RelColumnOrigin;
@@ -44,7 +45,6 @@ import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.calcite.sql.type.SqlTypeName;
import org.apache.calcite.util.ImmutableBitSet;
import org.apache.calcite.util.Pair;
-import org.apache.calcite.util.Util;
import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil;
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
@@ -58,7 +58,6 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.math.IntMath;
@@ -112,7 +111,7 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
public void onMatch(RelOptRuleCall call) {
final Aggregate aggregate = call.rel(0);
int numCountDistinct = getNumCountDistinctCall(aggregate);
- if (numCountDistinct == 0) {
+ if (numCountDistinct == 0 || aggregate.getGroupType() != Group.SIMPLE) {
return;
}
@@ -121,7 +120,8 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
int nonDistinctCount = 0;
List<List<Integer>> argListList = new ArrayList<List<Integer>>();
Set<List<Integer>> argListSets = new LinkedHashSet<List<Integer>>();
- Set<Integer> positions = new HashSet<>();
+ ImmutableBitSet.Builder newGroupSet = ImmutableBitSet.builder();
+ newGroupSet.addAll(aggregate.getGroupSet());
for (AggregateCall aggCall : aggregate.getAggCallList()) {
if (!aggCall.isDistinct()) {
++nonDistinctCount;
@@ -130,33 +130,27 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
ArrayList<Integer> argList = new ArrayList<Integer>();
for (Integer arg : aggCall.getArgList()) {
argList.add(arg);
- positions.add(arg);
+ newGroupSet.set(arg);
}
// Aggr checks for sorted argList.
argListList.add(argList);
argListSets.add(argList);
}
- Util.permAssert(argListSets.size() > 0, "containsDistinctCall lied");
+ Preconditions.checkArgument(argListSets.size() > 0, "containsDistinctCall lied");
- if (numCountDistinct > 1 && numCountDistinct == aggregate.getAggCallList().size()
- && aggregate.getGroupSet().isEmpty()) {
+ if (numCountDistinct > 1 && numCountDistinct == aggregate.getAggCallList().size()) {
LOG.debug("Trigger countDistinct rewrite. numCountDistinct is " + numCountDistinct);
// now newGroupSet contains the original group-by keys plus all the distinct positions, i.e., $5, $4, $6
// being a bit set, they are already sorted as the group by set
// and we get their position later, i.e., $4->1, $5->2, $6->3
cluster = aggregate.getCluster();
rexBuilder = cluster.getRexBuilder();
- RelNode converted = null;
- List<Integer> sourceOfForCountDistinct = new ArrayList<>();
- sourceOfForCountDistinct.addAll(positions);
- Collections.sort(sourceOfForCountDistinct);
try {
- converted = convert(aggregate, argListList, sourceOfForCountDistinct);
+ call.transformTo(convert(aggregate, argListList, newGroupSet.build()));
} catch (CalciteSemanticException e) {
LOG.debug(e.toString());
throw new RuntimeException(e);
}
- call.transformTo(converted);
return;
}
@@ -200,19 +194,23 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
* (department_id, gender, education_level))subq;
* @throws CalciteSemanticException
*/
- private RelNode convert(Aggregate aggregate, List<List<Integer>> argList, List<Integer> sourceOfForCountDistinct) throws CalciteSemanticException {
+ private RelNode convert(Aggregate aggregate, List<List<Integer>> argList, ImmutableBitSet newGroupSet)
+ throws CalciteSemanticException {
// we use this map to map the position of argList to the position of grouping set
Map<Integer, Integer> map = new HashMap<>();
List<List<Integer>> cleanArgList = new ArrayList<>();
- final Aggregate groupingSets = createGroupingSets(aggregate, argList, cleanArgList, map, sourceOfForCountDistinct);
- return createCount(groupingSets, argList, cleanArgList, map, sourceOfForCountDistinct);
+ final Aggregate groupingSets = createGroupingSets(aggregate, argList, cleanArgList, map, newGroupSet);
+ return createCount(groupingSets, argList, cleanArgList, map, aggregate.getGroupSet(), newGroupSet);
}
- private int getGroupingIdValue(List<Integer> list, List<Integer> sourceOfForCountDistinct,
+ private int getGroupingIdValue(List<Integer> list, ImmutableBitSet originalGroupSet, ImmutableBitSet newGroupSet,
int groupCount) {
int ind = IntMath.pow(2, groupCount) - 1;
+ for (int pos : originalGroupSet) {
+ ind &= ~(1 << groupCount - newGroupSet.indexOf(pos) - 1);
+ }
for (int i : list) {
- ind &= ~(1 << groupCount - sourceOfForCountDistinct.indexOf(i) - 1);
+ ind &= ~(1 << groupCount - newGroupSet.indexOf(i) - 1);
}
return ind;
}
@@ -222,28 +220,28 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
* @param argList: the original argList in aggregate
* @param cleanArgList: the new argList without duplicates
* @param map: the mapping from the original argList to the new argList
- * @param sourceOfForCountDistinct: the sorted positions of groupset
+ * @param newGroupSet: the sorted positions of groupset
* @return
* @throws CalciteSemanticException
*/
private RelNode createCount(Aggregate aggr, List<List<Integer>> argList,
List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
- List<Integer> sourceOfForCountDistinct) throws CalciteSemanticException {
- List<RexNode> originalInputRefs = Lists.transform(aggr.getRowType().getFieldList(),
- new Function<RelDataTypeField, RexNode>() {
- @Override
- public RexNode apply(RelDataTypeField input) {
- return new RexInputRef(input.getIndex(), input.getType());
- }
- });
+ ImmutableBitSet originalGroupSet, ImmutableBitSet newGroupSet) throws CalciteSemanticException {
+ final List<RexNode> originalInputRefs = aggr.getRowType().getFieldList()
+ .stream()
+ .map(input -> new RexInputRef(input.getIndex(), input.getType()))
+ .collect(Collectors.toList());
final List<RexNode> gbChildProjLst = Lists.newArrayList();
// for singular arg, count should not include null
// e.g., count(case when i=1 and department_id is not null then 1 else null end) as c0,
// for non-singular args, count can include null, i.e. (,) is counted as 1
for (List<Integer> list : cleanArgList) {
- RexNode condition = rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, originalInputRefs
- .get(originalInputRefs.size() - 1), rexBuilder.makeExactLiteral(new BigDecimal(
- getGroupingIdValue(list, sourceOfForCountDistinct, aggr.getGroupCount()))));
+ RexNode condition = rexBuilder.makeCall(
+ SqlStdOperatorTable.EQUALS,
+ originalInputRefs.get(originalInputRefs.size() - 1),
+ rexBuilder.makeExactLiteral(
+ new BigDecimal(
+ getGroupingIdValue(list, originalGroupSet, newGroupSet, aggr.getGroupCount()))));
if (list.size() == 1) {
int pos = list.get(0);
RexNode notNull = rexBuilder.makeCall(SqlStdOperatorTable.IS_NOT_NULL,
@@ -257,6 +255,10 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
gbChildProjLst.add(when);
}
+ for (int pos : originalGroupSet) {
+ gbChildProjLst.add(originalInputRefs.get(newGroupSet.indexOf(pos)));
+ }
+
// create the project before GB
RelNode gbInputRel = HiveProject.create(aggr, gbChildProjLst, null);
@@ -269,23 +271,25 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
TypeInfoFactory.longTypeInfo, i, aggFnRetType);
aggregateCalls.add(aggregateCall);
}
+ ImmutableBitSet groupSet =
+ ImmutableBitSet.range(cleanArgList.size(), cleanArgList.size() + originalGroupSet.cardinality());
Aggregate aggregate = new HiveAggregate(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), gbInputRel,
- ImmutableBitSet.of(), null, aggregateCalls);
+ groupSet, null, aggregateCalls);
// create the project after GB. For those repeated values, e.g., select
// count(distinct x, y), count(distinct y, x), we find the correct mapping.
if (map.isEmpty()) {
return aggregate;
} else {
- List<RexNode> originalAggrRefs = Lists.transform(aggregate.getRowType().getFieldList(),
- new Function<RelDataTypeField, RexNode>() {
- @Override
- public RexNode apply(RelDataTypeField input) {
- return new RexInputRef(input.getIndex(), input.getType());
- }
- });
+ final List<RexNode> originalAggrRefs = aggregate.getRowType().getFieldList()
+ .stream()
+ .map(input -> new RexInputRef(input.getIndex(), input.getType()))
+ .collect(Collectors.toList());
final List<RexNode> projLst = Lists.newArrayList();
int index = 0;
+ for (int i = 0; i < groupSet.cardinality(); i++) {
+ projLst.add(originalAggrRefs.get(index++));
+ }
for (int i = 0; i < argList.size(); i++) {
if (map.containsKey(i)) {
projLst.add(originalAggrRefs.get(map.get(i)));
@@ -302,18 +306,18 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
* @param argList: the original argList in aggregate
* @param cleanArgList: the new argList without duplicates
* @param map: the mapping from the original argList to the new argList
- * @param sourceOfForCountDistinct: the sorted positions of groupset
+ * @param groupSet: new group set
* @return
*/
private Aggregate createGroupingSets(Aggregate aggregate, List<List<Integer>> argList,
List<List<Integer>> cleanArgList, Map<Integer, Integer> map,
- List<Integer> sourceOfForCountDistinct) {
- final ImmutableBitSet groupSet = ImmutableBitSet.of(sourceOfForCountDistinct);
+ ImmutableBitSet groupSet) {
final List<ImmutableBitSet> origGroupSets = new ArrayList<>();
for (int i = 0; i < argList.size(); i++) {
List<Integer> list = argList.get(i);
- ImmutableBitSet bitSet = ImmutableBitSet.of(list);
+ ImmutableBitSet bitSet = aggregate.getGroupSet().union(
+ ImmutableBitSet.of(list));
int prev = origGroupSets.indexOf(bitSet);
if (prev == -1) {
origGroupSets.add(bitSet);
@@ -323,7 +327,7 @@ public final class HiveExpandDistinctAggregatesRule extends RelOptRule {
}
}
// Calcite expects the grouping sets sorted and without duplicates
- Collections.sort(origGroupSets, ImmutableBitSet.COMPARATOR);
+ origGroupSets.sort(ImmutableBitSet.COMPARATOR);
List<AggregateCall> aggregateCalls = new ArrayList<AggregateCall>();
// Create GroupingID column
diff --git a/ql/src/test/queries/clientpositive/multigroupbydistinct.q b/ql/src/test/queries/clientpositive/multigroupbydistinct.q
new file mode 100644
index 0000000..dbd81ba
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/multigroupbydistinct.q
@@ -0,0 +1,60 @@
+create table tabw4intcols (x integer, y integer, z integer, a integer);
+insert into tabw4intcols values (1, 1, 1, 1), (2, 2, 2, 2), (3, 3, 3, 3), (4, 4, 4, 4),
+ (1, 2, 1, 2), (2, 3, 2, 3), (3, 4, 3, 4), (4, 1, 4, 1),
+ (1, 2, 3, 4), (4, 3, 2, 1), (1, 2, 3, 4), (4, 3, 2, 1);
+
+explain cbo
+select z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z;
+
+select z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z;
+
+explain cbo
+select z, x, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x;
+
+select z, x, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x;
+
+explain cbo
+select x, z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x;
+
+select x, z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x;
+
+explain cbo
+select x, a, y, count(distinct z)
+from tabw4intcols
+group by a, x, y;
+
+select x, a, y, count(distinct z)
+from tabw4intcols
+group by a, x, y;
+
+explain cbo
+select x, count(distinct y), z, count(distinct a)
+from tabw4intcols
+group by z, x;
+
+select x, count(distinct y), z, count(distinct a)
+from tabw4intcols
+group by z, x;
+
+explain cbo
+select count(distinct y), x, z, count(distinct a)
+from tabw4intcols
+group by z, x;
+
+select count(distinct y), x, z, count(distinct a)
+from tabw4intcols
+group by z, x;
+
+drop table tabw4intcols;
diff --git a/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out b/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out
index 041bb28..23038f0 100644
--- a/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/llap/limit_pushdown.q.out
@@ -734,42 +734,49 @@ STAGE PLANS:
Statistics: Num rows: 12288 Data size: 1779850 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
- outputColumnNames: ctinyint, cstring1, cstring2
+ outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 12288 Data size: 1779850 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT cstring1), count(DISTINCT cstring2)
- keys: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
+ keys: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), 0L (type: bigint)
minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 12288 Data size: 1976458 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string)
- sort order: +++
+ key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), _col3 (type: bigint)
+ sort order: ++++
Map-reduce partition columns: _col0 (type: tinyint)
- Statistics: Num rows: 12288 Data size: 1976458 Basic stats: COMPLETE Column stats: COMPLETE
- TopN Hash Memory Usage: 0.3
- Execution mode: llap
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 2
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: tinyint)
+ keys: KEY._col0 (type: tinyint), KEY._col1 (type: string), KEY._col2 (type: string), KEY._col3 (type: bigint)
mode: mergepartial
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
- Limit
- Number of rows: 20
- Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
- File Output Operator
- compressed: false
- Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
- table:
- input format: org.apache.hadoop.mapred.SequenceFileInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: tinyint)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: tinyint)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 20
+ Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
diff --git a/ql/src/test/results/clientpositive/llap/limit_pushdown3.q.out b/ql/src/test/results/clientpositive/llap/limit_pushdown3.q.out
index 2d2b6eb..ebf6567 100644
--- a/ql/src/test/results/clientpositive/llap/limit_pushdown3.q.out
+++ b/ql/src/test/results/clientpositive/llap/limit_pushdown3.q.out
@@ -804,38 +804,45 @@ STAGE PLANS:
Statistics: Num rows: 12288 Data size: 1779850 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
- outputColumnNames: ctinyint, cstring1, cstring2
+ outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 12288 Data size: 1779850 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT cstring1), count(DISTINCT cstring2)
- keys: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
+ keys: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), 0L (type: bigint)
minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 12288 Data size: 1976458 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string)
- sort order: +++
+ key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), _col3 (type: bigint)
+ sort order: ++++
Map-reduce partition columns: _col0 (type: tinyint)
- Statistics: Num rows: 12288 Data size: 1976458 Basic stats: COMPLETE Column stats: COMPLETE
- TopN Hash Memory Usage: 0.3
- Execution mode: llap
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 2
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: tinyint)
+ keys: KEY._col0 (type: tinyint), KEY._col1 (type: string), KEY._col2 (type: string), KEY._col3 (type: bigint)
mode: mergepartial
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
- Reduce Output Operator
- key expressions: _col0 (type: tinyint)
- sort order: +
- Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
- TopN Hash Memory Usage: 0.3
- value expressions: _col1 (type: bigint), _col2 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: tinyint)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: tinyint)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: tinyint)
+ sort order: +
+ Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.3
+ value expressions: _col1 (type: bigint), _col2 (type: bigint)
Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
diff --git a/ql/src/test/results/clientpositive/llap/multigroupbydistinct.q.out b/ql/src/test/results/clientpositive/llap/multigroupbydistinct.q.out
new file mode 100644
index 0000000..6710ab2
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/multigroupbydistinct.q.out
@@ -0,0 +1,271 @@
+PREHOOK: query: create table tabw4intcols (x integer, y integer, z integer, a integer)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tabw4intcols
+POSTHOOK: query: create table tabw4intcols (x integer, y integer, z integer, a integer)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tabw4intcols
+PREHOOK: query: insert into tabw4intcols values (1, 1, 1, 1), (2, 2, 2, 2), (3, 3, 3, 3), (4, 4, 4, 4),
+ (1, 2, 1, 2), (2, 3, 2, 3), (3, 4, 3, 4), (4, 1, 4, 1),
+ (1, 2, 3, 4), (4, 3, 2, 1), (1, 2, 3, 4), (4, 3, 2, 1)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tabw4intcols
+POSTHOOK: query: insert into tabw4intcols values (1, 1, 1, 1), (2, 2, 2, 2), (3, 3, 3, 3), (4, 4, 4, 4),
+ (1, 2, 1, 2), (2, 3, 2, 3), (3, 4, 3, 4), (4, 1, 4, 1),
+ (1, 2, 3, 4), (4, 3, 2, 1), (1, 2, 3, 4), (4, 3, 2, 1)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tabw4intcols
+POSTHOOK: Lineage: tabw4intcols.a SCRIPT []
+POSTHOOK: Lineage: tabw4intcols.x SCRIPT []
+POSTHOOK: Lineage: tabw4intcols.y SCRIPT []
+POSTHOOK: Lineage: tabw4intcols.z SCRIPT []
+PREHOOK: query: explain cbo
+select z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{2}], agg#0=[count($0)], agg#1=[count($1)])
+ HiveProject($f0=[CASE(AND(=($3, 1), IS NOT NULL($1)), 1, null:INTEGER)], $f1=[CASE(AND(=($3, 2), IS NOT NULL($2)), 1, null:INTEGER)], $f2=[$0])
+ HiveAggregate(group=[{0, 1, 2}], groups=[[{0, 1}, {0, 2}]], GROUPING__ID=[GROUPING__ID()])
+ HiveProject($f0=[$2], $f1=[$1], $f2=[$3])
+ HiveTableScan(table=[[default, tabw4intcols]], table:alias=[tabw4intcols])
+
+PREHOOK: query: select z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: select z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+1 2 2
+2 2 3
+3 3 2
+4 2 2
+PREHOOK: query: explain cbo
+select z, x, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select z, x, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+CBO PLAN:
+HiveAggregate(group=[{2, 3}], agg#0=[count($0)], agg#1=[count($1)])
+ HiveProject($f0=[CASE(AND(=($4, 1), IS NOT NULL($2)), 1, null:INTEGER)], $f1=[CASE(AND(=($4, 2), IS NOT NULL($3)), 1, null:INTEGER)], $f2=[$0], $f3=[$1])
+ HiveAggregate(group=[{0, 1, 2, 3}], groups=[[{0, 1, 2}, {0, 1, 3}]], GROUPING__ID=[GROUPING__ID()])
+ HiveProject($f0=[$2], $f1=[$0], $f2=[$1], $f3=[$3])
+ HiveTableScan(table=[[default, tabw4intcols]], table:alias=[tabw4intcols])
+
+PREHOOK: query: select z, x, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: select z, x, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+1 1 2 2
+2 2 2 2
+2 4 1 1
+3 1 1 1
+3 3 2 2
+4 4 2 2
+PREHOOK: query: explain cbo
+select x, z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select x, z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(x=[$1], z=[$0], _o__c2=[$2], _o__c3=[$3])
+ HiveAggregate(group=[{2, 3}], agg#0=[count($0)], agg#1=[count($1)])
+ HiveProject($f0=[CASE(AND(=($4, 1), IS NOT NULL($2)), 1, null:INTEGER)], $f1=[CASE(AND(=($4, 2), IS NOT NULL($3)), 1, null:INTEGER)], $f2=[$0], $f3=[$1])
+ HiveAggregate(group=[{0, 1, 2, 3}], groups=[[{0, 1, 2}, {0, 1, 3}]], GROUPING__ID=[GROUPING__ID()])
+ HiveProject($f0=[$2], $f1=[$0], $f2=[$1], $f3=[$3])
+ HiveTableScan(table=[[default, tabw4intcols]], table:alias=[tabw4intcols])
+
+PREHOOK: query: select x, z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: select x, z, count(distinct y), count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+1 1 2 2
+2 2 2 2
+4 2 1 1
+1 3 1 1
+3 3 2 2
+4 4 2 2
+PREHOOK: query: explain cbo
+select x, a, y, count(distinct z)
+from tabw4intcols
+group by a, x, y
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select x, a, y, count(distinct z)
+from tabw4intcols
+group by a, x, y
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(x=[$0], a=[$2], y=[$1], _o__c3=[$3])
+ HiveAggregate(group=[{0, 1, 3}], agg#0=[count($2)])
+ HiveProject(x=[$0], y=[$1], z=[$2], a=[$3])
+ HiveAggregate(group=[{0, 1, 2, 3}])
+ HiveTableScan(table=[[default, tabw4intcols]], table:alias=[tabw4intcols])
+
+PREHOOK: query: select x, a, y, count(distinct z)
+from tabw4intcols
+group by a, x, y
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: select x, a, y, count(distinct z)
+from tabw4intcols
+group by a, x, y
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+1 1 1 1
+1 2 2 1
+1 4 2 1
+2 2 2 1
+2 3 3 1
+3 3 3 1
+3 4 4 1
+4 1 1 1
+4 1 3 1
+4 4 4 1
+PREHOOK: query: explain cbo
+select x, count(distinct y), z, count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select x, count(distinct y), z, count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(x=[$1], _o__c1=[$2], z=[$0], _o__c3=[$3])
+ HiveAggregate(group=[{2, 3}], agg#0=[count($0)], agg#1=[count($1)])
+ HiveProject($f0=[CASE(AND(=($4, 1), IS NOT NULL($2)), 1, null:INTEGER)], $f1=[CASE(AND(=($4, 2), IS NOT NULL($3)), 1, null:INTEGER)], $f2=[$0], $f3=[$1])
+ HiveAggregate(group=[{0, 1, 2, 3}], groups=[[{0, 1, 2}, {0, 1, 3}]], GROUPING__ID=[GROUPING__ID()])
+ HiveProject($f0=[$2], $f1=[$0], $f2=[$1], $f3=[$3])
+ HiveTableScan(table=[[default, tabw4intcols]], table:alias=[tabw4intcols])
+
+PREHOOK: query: select x, count(distinct y), z, count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: select x, count(distinct y), z, count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+1 2 1 2
+2 2 2 2
+4 1 2 1
+1 1 3 1
+3 2 3 2
+4 2 4 2
+PREHOOK: query: explain cbo
+select count(distinct y), x, z, count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select count(distinct y), x, z, count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(_o__c0=[$2], x=[$1], z=[$0], _o__c3=[$3])
+ HiveAggregate(group=[{2, 3}], agg#0=[count($0)], agg#1=[count($1)])
+ HiveProject($f0=[CASE(AND(=($4, 1), IS NOT NULL($2)), 1, null:INTEGER)], $f1=[CASE(AND(=($4, 2), IS NOT NULL($3)), 1, null:INTEGER)], $f2=[$0], $f3=[$1])
+ HiveAggregate(group=[{0, 1, 2, 3}], groups=[[{0, 1, 2}, {0, 1, 3}]], GROUPING__ID=[GROUPING__ID()])
+ HiveProject($f0=[$2], $f1=[$0], $f2=[$1], $f3=[$3])
+ HiveTableScan(table=[[default, tabw4intcols]], table:alias=[tabw4intcols])
+
+PREHOOK: query: select count(distinct y), x, z, count(distinct a)
+from tabw4intcols
+group by z, x
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct y), x, z, count(distinct a)
+from tabw4intcols
+group by z, x
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tabw4intcols
+#### A masked pattern was here ####
+2 1 1 2
+2 2 2 2
+1 4 2 1
+1 1 3 1
+2 3 3 2
+2 4 4 2
+PREHOOK: query: drop table tabw4intcols
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@tabw4intcols
+PREHOOK: Output: default@tabw4intcols
+POSTHOOK: query: drop table tabw4intcols
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@tabw4intcols
+POSTHOOK: Output: default@tabw4intcols
diff --git a/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out b/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out
index 1e0aa93..34c572e 100644
--- a/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out
+++ b/ql/src/test/results/clientpositive/llap/offset_limit_ppd_optimizer.q.out
@@ -741,43 +741,50 @@ STAGE PLANS:
Statistics: Num rows: 12288 Data size: 1779850 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
- outputColumnNames: ctinyint, cstring1, cstring2
+ outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 12288 Data size: 1779850 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT cstring1), count(DISTINCT cstring2)
- keys: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
+ keys: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), 0L (type: bigint)
minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 12288 Data size: 1976458 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string)
- sort order: +++
+ key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), _col3 (type: bigint)
+ sort order: ++++
Map-reduce partition columns: _col0 (type: tinyint)
- Statistics: Num rows: 12288 Data size: 1976458 Basic stats: COMPLETE Column stats: COMPLETE
- TopN Hash Memory Usage: 0.3
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
Execution mode: llap
LLAP IO: all inputs
Reducer 2
Execution mode: llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: tinyint)
+ keys: KEY._col0 (type: tinyint), KEY._col1 (type: string), KEY._col2 (type: string), KEY._col3 (type: bigint)
mode: mergepartial
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
- Limit
- Number of rows: 20
- Offset of rows: 10
- Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
- File Output Operator
- compressed: false
- Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
- table:
- input format: org.apache.hadoop.mapred.SequenceFileInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: tinyint)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 24576 Data size: 3756114 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: tinyint)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 131 Data size: 2492 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 20
+ Offset of rows: 10
+ Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 20 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
diff --git a/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out b/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out
index 2bacc42..bd96597 100644
--- a/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out
+++ b/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out
@@ -48,38 +48,46 @@ STAGE PLANS:
Statistics: Num rows: 5 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: id (type: int), key (type: int), name (type: int)
- outputColumnNames: id, key, name
+ outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 5 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT key), count(DISTINCT name)
- keys: id (type: int), key (type: int), name (type: int)
- minReductionHashAggr: 0.6
+ keys: _col0 (type: int), _col1 (type: int), _col2 (type: int), 0L (type: bigint)
+ minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 2 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 5 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int)
- sort order: +++
+ key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: bigint)
+ sort order: ++++
Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 2 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE
- Execution mode: llap
+ Statistics: Num rows: 5 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
LLAP IO: no inputs
Reducer 2
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: int)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int), KEY._col3 (type: bigint)
mode: mergepartial
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
- File Output Operator
- compressed: false
- Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
- table:
- input format: org.apache.hadoop.mapred.SequenceFileInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 5 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 5 Data size: 100 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: int)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
@@ -148,7 +156,7 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: no inputs
Reducer 2
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int)
@@ -156,33 +164,41 @@ STAGE PLANS:
outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT _col1), count(DISTINCT _col2)
- keys: _col0 (type: int), _col1 (type: int), _col2 (type: int)
- minReductionHashAggr: 0.5
+ keys: _col0 (type: int), _col1 (type: int), _col2 (type: int), 0L (type: bigint)
+ minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int)
- sort order: +++
+ key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: bigint)
+ sort order: ++++
Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: int)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int), KEY._col3 (type: bigint)
mode: mergepartial
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
- File Output Operator
- compressed: false
- Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
- table:
- input format: org.apache.hadoop.mapred.SequenceFileInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: int)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
@@ -238,7 +254,7 @@ STAGE PLANS:
outputColumnNames: id, key, name
Statistics: Num rows: 5 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- keys: id (type: int), key (type: int), name (type: int)
+ keys: id (type: int), name (type: int), key (type: int)
minReductionHashAggr: 0.6
mode: hash
outputColumnNames: _col0, _col1, _col2
@@ -251,7 +267,7 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: no inputs
Reducer 2
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int)
@@ -259,33 +275,41 @@ STAGE PLANS:
outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT _col2), count(DISTINCT _col1)
- keys: _col0 (type: int), _col2 (type: int), _col1 (type: int)
- minReductionHashAggr: 0.5
+ keys: _col0 (type: int), _col1 (type: int), _col2 (type: int), 0L (type: bigint)
+ minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int)
- sort order: +++
+ key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: bigint)
+ sort order: ++++
Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: int)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int), KEY._col3 (type: bigint)
mode: mergepartial
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
- File Output Operator
- compressed: false
- Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
- table:
- input format: org.apache.hadoop.mapred.SequenceFileInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: int)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator
@@ -329,6 +353,7 @@ STAGE PLANS:
Edges:
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+ Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -354,7 +379,7 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: no inputs
Reducer 2
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int)
@@ -362,22 +387,46 @@ STAGE PLANS:
outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT _col1), count(DISTINCT _col2)
- keys: _col0 (type: int), _col1 (type: int), _col2 (type: int)
- minReductionHashAggr: 0.5
+ keys: _col0 (type: int), _col1 (type: int), _col2 (type: int), 0L (type: bigint)
+ minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int)
- sort order: +++
- Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: bigint)
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: int)
+ minReductionHashAggr: 0.5
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: bigint), _col2 (type: bigint)
+ Reducer 4
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0), count(VALUE._col1)
keys: KEY._col0 (type: int)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
@@ -432,6 +481,7 @@ STAGE PLANS:
Edges:
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+ Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -444,7 +494,7 @@ STAGE PLANS:
outputColumnNames: id, key, name
Statistics: Num rows: 5 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- keys: id (type: int), key (type: int), name (type: int)
+ keys: id (type: int), name (type: int), key (type: int)
minReductionHashAggr: 0.6
mode: hash
outputColumnNames: _col0, _col1, _col2
@@ -457,7 +507,7 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: no inputs
Reducer 2
- Execution mode: llap
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int)
@@ -465,22 +515,46 @@ STAGE PLANS:
outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
Group By Operator
- aggregations: count(DISTINCT _col2), count(DISTINCT _col1)
- keys: _col0 (type: int), _col2 (type: int), _col1 (type: int)
- minReductionHashAggr: 0.5
+ keys: _col0 (type: int), _col1 (type: int), _col2 (type: int), 0L (type: bigint)
+ minReductionHashAggr: 0.0
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
- key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int)
- sort order: +++
- Map-reduce partition columns: _col0 (type: int)
- Statistics: Num rows: 1 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE
+ key expressions: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: bigint)
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1 (type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
Reducer 3
- Execution mode: llap
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 (type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 2 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: int)
+ minReductionHashAggr: 0.5
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: bigint), _col2 (type: bigint)
+ Reducer 4
+ Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
+ aggregations: count(VALUE._col0), count(VALUE._col1)
keys: KEY._col0 (type: int)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
diff --git a/ql/src/test/results/clientpositive/spark/auto_join18_multi_distinct.q.out b/ql/src/test/results/clientpositive/spark/auto_join18_multi_distinct.q.out
index 042e4fe..df7f9bc 100644
--- a/ql/src/test/results/clientpositive/spark/auto_join18_multi_distinct.q.out
+++ b/ql/src/test/results/clientpositive/spark/auto_join18_multi_distinct.q.out
@@ -76,20 +76,20 @@ STAGE PLANS:
Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: key (type: string), value (type: string)
- outputColumnNames: key, value
+ outputColumnNames: _col0, _col1
Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: count(DISTINCT value), count(DISTINCT key)
- keys: key (type: string), value (type: string)
+ keys: _col0 (type: string), _col1 (type: string), 0L (type: bigint)
minReductionHashAggr: 0.99
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 50 Data size: 382 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
- key expressions: _col0 (type: string), _col1 (type: string)
- sort order: ++
+ key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: bigint)
+ sort order: +++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 50 Data size: 382 Basic stats: COMPLETE Column stats: NONE
+ Execution mode: vectorized
Reducer 2
Execution mode: vectorized
Reduce Operator Tree:
@@ -145,19 +145,29 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Reducer 6
+ Execution mode: vectorized
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: string)
+ keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: bigint)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: bigint), _col2 (type: bigint)
+ Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: CASE WHEN (((_col2 = 0L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col2 = 1L) and _col0 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: string)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint), _col2 (type: bigint)
Stage: Stage-0
Fetch Operator
diff --git a/ql/src/test/results/clientpositive/spark/join18_multi_distinct.q.out b/ql/src/test/results/clientpositive/spark/join18_multi_distinct.q.out
index 4347144..fe8d3f1 100644
--- a/ql/src/test/results/clientpositive/spark/join18_multi_distinct.q.out
+++ b/ql/src/test/results/clientpositive/spark/join18_multi_distinct.q.out
@@ -75,20 +75,20 @@ STAGE PLANS:
Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: key (type: string), value (type: string)
- outputColumnNames: key, value
+ outputColumnNames: _col0, _col1
Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: count(DISTINCT value), count(DISTINCT key)
- keys: key (type: string), value (type: string)
+ keys: _col0 (type: string), _col1 (type: string), 0L (type: bigint)
minReductionHashAggr: 0.99
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3
- Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 50 Data size: 382 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
- key expressions: _col0 (type: string), _col1 (type: string)
- sort order: ++
+ key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: bigint)
+ sort order: +++
Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 50 Data size: 382 Basic stats: COMPLETE Column stats: NONE
+ Execution mode: vectorized
Reducer 2
Execution mode: vectorized
Reduce Operator Tree:
@@ -122,19 +122,29 @@ STAGE PLANS:
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Reducer 5
+ Execution mode: vectorized
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: string)
+ keys: KEY._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: bigint)
mode: mergepartial
outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
- Reduce Output Operator
- key expressions: _col0 (type: string)
- sort order: +
- Map-reduce partition columns: _col0 (type: string)
- Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col1 (type: bigint), _col2 (type: bigint)
+ Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: CASE WHEN (((_col2 = 0L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col2 = 1L) and _col0 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 25 Data size: 191 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: string)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 12 Data size: 91 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col1 (type: bigint), _col2 (type: bigint)
Stage: Stage-0
Fetch Operator
diff --git a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
index 693198e..8336176 100644
--- a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
@@ -704,39 +704,48 @@ STAGE PLANS:
Statistics: Num rows: 12288 Data size: 2907994 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
- outputColumnNames: ctinyint, cstring1, cstring2
+ outputColumnNames: _col0, _col1, _col2
Statistics: Num rows: 12288 Data size: 2907994 Basic stats: COMPLETE Column stats: NONE
Group By Operator
- aggregations: count(DISTINCT cstring1), count(DISTINCT cstring2)
- keys: ctinyint (type: tinyint), cstring1 (type: string), cstring2 (type: string)
+ keys: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), 0L (type: bigint)
minReductionHashAggr: 0.99
mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3, _col4
- Statistics: Num rows: 12288 Data size: 2907994 Basic stats: COMPLETE Column stats: NONE
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 24576 Data size: 5815988 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
- key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string)
- sort order: +++
+ key expressions: _col0 (type: tinyint), _col1 (type: string), _col2 (type: string), _col3 (type: bigint)
+ sort order: ++++
Map-reduce partition columns: _col0 (type: tinyint)
- Statistics: Num rows: 12288 Data size: 2907994 Basic stats: COMPLETE Column stats: NONE
- TopN Hash Memory Usage: 0.3
+ Statistics: Num rows: 24576 Data size: 5815988 Basic stats: COMPLETE Column stats: NONE
+ Execution mode: vectorized
Reducer 2
+ Execution mode: vectorized
Reduce Operator Tree:
Group By Operator
- aggregations: count(DISTINCT KEY._col1:0._col0), count(DISTINCT KEY._col1:1._col0)
- keys: KEY._col0 (type: tinyint)
+ keys: KEY._col0 (type: tinyint), KEY._col1 (type: string), KEY._col2 (type: string), KEY._col3 (type: bigint)
mode: mergepartial
- outputColumnNames: _col0, _col1, _col2
- Statistics: Num rows: 6144 Data size: 1453997 Basic stats: COMPLETE Column stats: NONE
- Limit
- Number of rows: 20
- Statistics: Num rows: 20 Data size: 4720 Basic stats: COMPLETE Column stats: NONE
- File Output Operator
- compressed: false
- Statistics: Num rows: 20 Data size: 4720 Basic stats: COMPLETE Column stats: NONE
- table:
- input format: org.apache.hadoop.mapred.SequenceFileInputFormat
- output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
- serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 12288 Data size: 2907994 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: CASE WHEN (((_col3 = 1L) and _col1 is not null)) THEN (1) ELSE (null) END (type: int), CASE WHEN (((_col3 = 2L) and _col2 is not null)) THEN (1) ELSE (null) END (type: int), _col0 (type: tinyint)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 12288 Data size: 2907994 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: count(_col0), count(_col1)
+ keys: _col2 (type: tinyint)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 6144 Data size: 1453997 Basic stats: COMPLETE Column stats: NONE
+ Limit
+ Number of rows: 20
+ Statistics: Num rows: 20 Data size: 4720 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 20 Data size: 4720 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
Stage: Stage-0
Fetch Operator