Posted to commits@hive.apache.org by jc...@apache.org on 2019/11/18 04:11:00 UTC
[hive] branch master updated: HIVE-22464: Implement support for NULLS FIRST/LAST in TopNKeyOperator (Krisztian Kasa, reviewed by Jesus Camacho Rodriguez)
This is an automated email from the ASF dual-hosted git repository.
jcamacho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new fc81b89 HIVE-22464: Implement support for NULLS FIRST/LAST in TopNKeyOperator (Krisztian Kasa, reviewed by Jesus Camacho Rodriguez)
fc81b89 is described below
commit fc81b8909b1a0e6aa15900387a98bccf38ae2247
Author: Krisztian Kasa <kk...@cloudera.com>
AuthorDate: Sun Nov 17 20:09:10 2019 -0800
HIVE-22464: Implement support for NULLS FIRST/LAST in TopNKeyOperator (Krisztian Kasa, reviewed by Jesus Camacho Rodriguez)
---
.../hadoop/hive/ql/exec/TopNKeyOperator.java | 43 +-
.../hadoop/hive/ql/optimizer/TopNKeyProcessor.java | 10 +-
.../apache/hadoop/hive/ql/plan/TopNKeyDesc.java | 14 +
.../apache/hadoop/hive/ql/util/NullOrdering.java | 21 +-
ql/src/test/queries/clientpositive/topnkey.q | 55 ++-
.../test/results/clientpositive/llap/topnkey.q.out | 472 ++++++++++++++++++-
.../test/results/clientpositive/tez/topnkey.q.out | 482 ++++++++++++++++++--
ql/src/test/results/clientpositive/topnkey.q.out | 499 ++++++++++++++++++++-
.../objectinspector/ObjectInspectorUtils.java | 51 ++-
9 files changed, 1571 insertions(+), 76 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
index 4734824..d16500e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.TopNKeyDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
+import org.apache.hadoop.hive.ql.util.NullOrdering;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
@@ -58,21 +59,26 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializab
}
public static class KeyWrapperComparator implements Comparator<KeyWrapper> {
- private ObjectInspector[] objectInspectors1;
- private ObjectInspector[] objectInspectors2;
- private boolean[] columnSortOrderIsDesc;
-
- public KeyWrapperComparator(ObjectInspector[] objectInspectors1, ObjectInspector[]
- objectInspectors2, boolean[] columnSortOrderIsDesc) {
- this.objectInspectors1 = objectInspectors1;
- this.objectInspectors2 = objectInspectors2;
- this.columnSortOrderIsDesc = columnSortOrderIsDesc;
+
+ private final ObjectInspector[] keyObjectInspectors;
+ private final boolean[] columnSortOrderIsDesc;
+ private final ObjectInspectorUtils.NullValueOption[] nullSortOrder;
+
+ KeyWrapperComparator(ObjectInspector[] keyObjectInspectors, String columnSortOrder, String nullSortOrder) {
+ this.keyObjectInspectors = keyObjectInspectors;
+ this.columnSortOrderIsDesc = new boolean[columnSortOrder.length()];
+ this.nullSortOrder = new ObjectInspectorUtils.NullValueOption[nullSortOrder.length()];
+ for (int i = 0; i < columnSortOrder.length(); ++i) {
+ this.columnSortOrderIsDesc[i] = columnSortOrder.charAt(i) == '-';
+ this.nullSortOrder[i] = NullOrdering.fromSign(nullSortOrder.charAt(i)).getNullValueOption();
+ }
}
@Override
public int compare(KeyWrapper key1, KeyWrapper key2) {
- return ObjectInspectorUtils.compare(key1.getKeyArray(), objectInspectors1,
- key2.getKeyArray(), objectInspectors2, columnSortOrderIsDesc);
+ return ObjectInspectorUtils.compare(
+ key1.getKeyArray(), keyObjectInspectors, key2.getKeyArray(), keyObjectInspectors,
+ columnSortOrderIsDesc, nullSortOrder);
}
}
@@ -82,12 +88,6 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializab
this.topN = conf.getTopN();
- String columnSortOrder = conf.getColumnSortOrder();
- boolean[] columnSortOrderIsDesc = new boolean[columnSortOrder.length()];
- for (int i = 0; i < columnSortOrderIsDesc.length; i++) {
- columnSortOrderIsDesc[i] = (columnSortOrder.charAt(i) == '-');
- }
-
ObjectInspector rowInspector = inputObjInspectors[0];
ObjectInspector standardObjInspector = ObjectInspectorUtils.getStandardObjectInspector(rowInspector);
outputObjInspector = rowInspector;
@@ -107,8 +107,13 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializab
standardKeyObjectInspectors[i] = standardKeyFields[i].initialize(standardObjInspector);
}
- priorityQueue = new PriorityQueue<>(topN + 1, new TopNKeyOperator.KeyWrapperComparator(
- standardKeyObjectInspectors, standardKeyObjectInspectors, columnSortOrderIsDesc));
+ String columnSortOrder = conf.getColumnSortOrder();
+ String nullSortOrder = conf.getNullOrder();
+
+ // We need a reversed comparator because the PriorityQueue.poll() method is used for filtering out keys.
+ // Ex.: with ORDER BY key1 ASC, a call to poll() should remove the largest key.
+ priorityQueue = new PriorityQueue<>(topN + 1,
+ new KeyWrapperComparator(standardKeyObjectInspectors, columnSortOrder, nullSortOrder).reversed());
KeyWrapperFactory keyWrapperFactory = new KeyWrapperFactory(keyFields, keyObjectInspectors,
standardKeyObjectInspectors);
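
For illustration, here is a minimal standalone sketch of the technique introduced above, outside Hive's API (class and variable names below are invented): the per-column sort-order string uses '+' for ascending and '-' for descending, the null-order string uses 'a' for NULLS FIRST and 'z' for NULLS LAST, and the comparator is reversed so that a bounded PriorityQueue can evict the current worst key via poll(), retaining only the top N.

import java.util.Arrays;
import java.util.Comparator;
import java.util.PriorityQueue;

public class TopNKeySketch {
  // Build a null-aware comparator from '+'/'-' sort order and 'a'/'z' null order.
  static Comparator<Integer[]> comparator(String sortOrder, String nullOrder) {
    return (k1, k2) -> {
      for (int i = 0; i < sortOrder.length(); i++) {
        Integer a = k1[i], b = k2[i];
        int c;
        if (a == null || b == null) {
          if (a == null && b == null) {
            c = 0;
          } else if (nullOrder.charAt(i) == 'a') { // 'a': nulls sort first
            c = (a == null) ? -1 : 1;
          } else {                                 // 'z': nulls sort last
            c = (a == null) ? 1 : -1;
          }
        } else {
          c = a.compareTo(b);
          if (sortOrder.charAt(i) == '-') {        // '-': descending column
            c = -c;
          }
        }
        if (c != 0) {
          return c;
        }
      }
      return 0;
    };
  }

  public static void main(String[] args) {
    int topN = 2;
    // The reversed comparator turns the queue into a max-heap, so poll()
    // evicts the worst key kept so far, which is the trick the patch relies on.
    PriorityQueue<Integer[]> queue =
        new PriorityQueue<>(topN + 1, comparator("+", "z").reversed());
    for (Integer v : new Integer[] {5, null, 7, 6, null}) {
      queue.offer(new Integer[] {v});
      if (queue.size() > topN) {
        queue.poll(); // under ASC NULLS LAST this drops the largest key, nulls included
      }
    }
    queue.forEach(k -> System.out.println(Arrays.toString(k))); // keeps [5] and [6]
  }
}

With the sample data of the topnkey.q test below, this keeps keys 5 and 6 for ORDER BY a ASC LIMIT 2, matching the q.out results further down.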
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java
index 721a9b9..4b4cf99 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java
@@ -53,7 +53,7 @@ public class TopNKeyProcessor implements NodeProcessor {
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
- Object... nodeOutputs) throws SemanticException {
+ Object... nodeOutputs) throws SemanticException {
// Get ReduceSinkOperator
ReduceSinkOperator reduceSinkOperator = (ReduceSinkOperator) nd;
@@ -95,11 +95,11 @@ public class TopNKeyProcessor implements NodeProcessor {
}
// Insert a new top n key operator between the group by operator and its parent
- TopNKeyDesc topNKeyDesc = new TopNKeyDesc(reduceSinkDesc.getTopN(), reduceSinkDesc.getOrder(),
- groupByKeyColumns);
+ TopNKeyDesc topNKeyDesc = new TopNKeyDesc(
+ reduceSinkDesc.getTopN(), reduceSinkDesc.getOrder(), reduceSinkDesc.getNullOrder(), groupByKeyColumns);
Operator<? extends OperatorDesc> newOperator = OperatorFactory.getAndMakeChild(
- groupByOperator.getCompilationOpContext(), (OperatorDesc) topNKeyDesc,
- new RowSchema(groupByOperator.getSchema()), groupByOperator.getParentOperators());
+ groupByOperator.getCompilationOpContext(), (OperatorDesc) topNKeyDesc,
+ new RowSchema(groupByOperator.getSchema()), groupByOperator.getParentOperators());
newOperator.getChildOperators().add(groupByOperator);
groupByOperator.getParentOperators().add(newOperator);
parentOperator.removeChild(groupByOperator);
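
The graph splice performed above, placing the new top n key operator between the group-by operator and its former parent, can be sketched with a toy node type. This is not Hive's Operator API, just the same rewiring pattern:

import java.util.ArrayList;
import java.util.List;

// Minimal DAG node standing in for Hive's Operator; illustrative only.
class Node {
  final String name;
  final List<Node> parents = new ArrayList<>();
  final List<Node> children = new ArrayList<>();
  Node(String name) { this.name = name; }
}

class SpliceSketch {
  // Insert tnk between groupBy and its parent, mirroring the processor above.
  static void splice(Node parent, Node groupBy, Node tnk) {
    tnk.parents.add(parent);          // tnk inherits the old parent
    parent.children.add(tnk);
    tnk.children.add(groupBy);        // groupBy now hangs off tnk
    groupBy.parents.add(tnk);
    parent.children.remove(groupBy);  // cut the old direct edge
    groupBy.parents.remove(parent);
  }

  public static void main(String[] args) {
    Node scan = new Node("TS"), gby = new Node("GBY"), tnk = new Node("TNK");
    scan.children.add(gby);
    gby.parents.add(scan);
    splice(scan, gby, tnk);
    System.out.println(gby.parents.get(0).name); // TNK
    System.out.println(tnk.parents.get(0).name); // TS
  }
}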
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
index c62c4a9..84e6149 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
@@ -33,6 +33,7 @@ public class TopNKeyDesc extends AbstractOperatorDesc {
private int topN;
private String columnSortOrder;
+ private String nullOrder;
private List<ExprNodeDesc> keyColumns;
public TopNKeyDesc() {
@@ -41,10 +42,12 @@ public class TopNKeyDesc extends AbstractOperatorDesc {
public TopNKeyDesc(
final int topN,
final String columnSortOrder,
+ final String nullOrder,
final List<ExprNodeDesc> keyColumns) {
this.topN = topN;
this.columnSortOrder = columnSortOrder;
+ this.nullOrder = nullOrder;
this.keyColumns = keyColumns;
}
@@ -66,6 +69,15 @@ public class TopNKeyDesc extends AbstractOperatorDesc {
this.columnSortOrder = columnSortOrder;
}
+ @Explain(displayName = "null sort order", explainLevels = { Level.EXTENDED })
+ public String getNullOrder() {
+ return nullOrder;
+ }
+
+ public void setNullOrder(String nullOrder) {
+ this.nullOrder = nullOrder;
+ }
+
@Explain(displayName = "keys")
public String getKeyString() {
return PlanUtils.getExprListString(keyColumns);
@@ -98,6 +110,7 @@ public class TopNKeyDesc extends AbstractOperatorDesc {
TopNKeyDesc otherDesc = (TopNKeyDesc) other;
return getTopN() == otherDesc.getTopN() &&
Objects.equals(columnSortOrder, otherDesc.columnSortOrder) &&
+ Objects.equals(nullOrder, otherDesc.nullOrder) &&
ExprNodeDescUtils.isSame(keyColumns, otherDesc.keyColumns);
}
return false;
@@ -108,6 +121,7 @@ public class TopNKeyDesc extends AbstractOperatorDesc {
TopNKeyDesc ret = new TopNKeyDesc();
ret.setTopN(topN);
ret.setColumnSortOrder(columnSortOrder);
+ ret.setNullOrder(nullOrder);
ret.setKeyColumns(getKeyColumns() == null ? null : new ArrayList<>(getKeyColumns()));
return ret;
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java b/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java
index 6bf1db2..46ff329 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java
@@ -24,18 +24,20 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.NullVa
* Enum for converting different Null ordering description types.
*/
public enum NullOrdering {
- NULLS_FIRST(1, HiveParser.TOK_NULLS_FIRST, NullValueOption.MAXVALUE),
- NULLS_LAST(0, HiveParser.TOK_NULLS_LAST, NullValueOption.MINVALUE);
+ NULLS_FIRST(1, HiveParser.TOK_NULLS_FIRST, NullValueOption.MAXVALUE, 'a'),
+ NULLS_LAST(0, HiveParser.TOK_NULLS_LAST, NullValueOption.MINVALUE, 'z');
- NullOrdering(int code, int token, NullValueOption nullValueOption) {
+ NullOrdering(int code, int token, NullValueOption nullValueOption, char sign) {
this.code = code;
this.token = token;
this.nullValueOption = nullValueOption;
+ this.sign = sign;
}
private final int code;
private final int token;
private final NullValueOption nullValueOption;
+ private final char sign;
public static NullOrdering fromToken(int token) {
for (NullOrdering nullOrdering : NullOrdering.values()) {
@@ -55,6 +57,15 @@ public enum NullOrdering {
throw new EnumConstantNotPresentException(NullOrdering.class, "No enum constant present with code " + code);
}
+ public static NullOrdering fromSign(char sign) {
+ for (NullOrdering nullOrdering : NullOrdering.values()) {
+ if (nullOrdering.sign == sign) {
+ return nullOrdering;
+ }
+ }
+ throw new EnumConstantNotPresentException(NullOrdering.class, "No enum constant present with sign " + sign);
+ }
+
public int getCode() {
return code;
}
@@ -66,4 +77,8 @@ public enum NullOrdering {
public NullValueOption getNullValueOption() {
return nullValueOption;
}
+
+ public char getSign() {
+ return sign;
+ }
}
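
One detail worth noting: these sign characters are what appear, one per key column, in the "null sort order" strings of the EXPLAIN EXTENDED plans below (for example "null sort order: z" on an ascending key). A trimmed-down sketch of the lookup, keeping only the sign mapping from the enum above:

public class NullOrderingSignDemo {
  // Mirror of the patched enum, reduced to the new sign mapping; illustrative only.
  enum NullOrdering {
    NULLS_FIRST('a'),
    NULLS_LAST('z');

    private final char sign;

    NullOrdering(char sign) {
      this.sign = sign;
    }

    static NullOrdering fromSign(char sign) {
      for (NullOrdering nullOrdering : values()) {
        if (nullOrdering.sign == sign) {
          return nullOrdering;
        }
      }
      throw new EnumConstantNotPresentException(NullOrdering.class,
          "No enum constant present with sign " + sign);
    }
  }

  public static void main(String[] args) {
    // A two-column sort "++" with null order "za" puts NULLs last on the
    // first key and first on the second key.
    System.out.println(NullOrdering.fromSign('z')); // NULLS_LAST
    System.out.println(NullOrdering.fromSign('a')); // NULLS_FIRST
  }
}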
diff --git a/ql/src/test/queries/clientpositive/topnkey.q b/ql/src/test/queries/clientpositive/topnkey.q
index e02a41d..283f426 100644
--- a/ql/src/test/queries/clientpositive/topnkey.q
+++ b/ql/src/test/queries/clientpositive/topnkey.q
@@ -14,7 +14,7 @@ set hive.tez.dynamic.partition.pruning=true;
set hive.stats.fetch.column.stats=true;
set hive.cbo.enable=true;
-EXPLAIN
+EXPLAIN EXTENDED
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5;
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5;
@@ -29,3 +29,56 @@ SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key)
SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5;
+CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+);
+
+INSERT INTO t_test VALUES
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL);
+
+EXPLAIN
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3;
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3;
+
+
+EXPLAIN
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2;
+
+DROP TABLE IF EXISTS t_test;
+
+CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+);
+
+INSERT INTO t_test VALUES
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL);
+
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2;
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2;
+
+DROP TABLE IF EXISTS t_test;
diff --git a/ql/src/test/results/clientpositive/llap/topnkey.q.out b/ql/src/test/results/clientpositive/llap/topnkey.q.out
index e3c0bc1..cd47e9d 100644
--- a/ql/src/test/results/clientpositive/llap/topnkey.q.out
+++ b/ql/src/test/results/clientpositive/llap/topnkey.q.out
@@ -1,13 +1,18 @@
-PREHOOK: query: EXPLAIN
+PREHOOK: query: EXPLAIN EXTENDED
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
-POSTHOOK: query: EXPLAIN
+POSTHOOK: query: EXPLAIN EXTENDED
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
+OPTIMIZED SQL: SELECT `key` AS `$f0`, SUM(CAST(SUBSTR(`value`, 5) AS INTEGER)) AS `$f1`
+FROM `default`.`src`
+GROUP BY `key`
+ORDER BY `key`
+LIMIT 5
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1
@@ -26,6 +31,7 @@ STAGE PLANS:
TableScan
alias: src
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ GatherStats: false
Select Operator
expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int)
outputColumnNames: _col0, _col1
@@ -33,6 +39,7 @@ STAGE PLANS:
Top N Key Operator
sort order: +
keys: _col0 (type: string)
+ null sort order: z
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
top n: 5
Group By Operator
@@ -44,15 +51,73 @@ STAGE PLANS:
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
+ null sort order: z
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ tag: -1
+ TopN: 5
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: bigint)
+ auto parallelism: true
Execution mode: llap
LLAP IO: no inputs
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: src
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns key,value
+ columns.comments 'default','default'
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.src
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns key,value
+ columns.comments 'default','default'
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.src
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.src
+ name: default.src
+ Truncated Path -> Alias:
+ /src [src]
Reducer 2
Execution mode: llap
+ Needs Tagging: false
Reduce Operator Tree:
Group By Operator
aggregations: sum(VALUE._col0)
@@ -62,12 +127,17 @@ STAGE PLANS:
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
+ null sort order: z
sort order: +
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ tag: -1
+ TopN: 5
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: bigint)
+ auto parallelism: false
Reducer 3
Execution mode: llap
+ Needs Tagging: false
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint)
@@ -78,11 +148,26 @@ STAGE PLANS:
Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
Stage: Stage-0
Fetch Operator
@@ -330,3 +415,386 @@ POSTHOOK: Input: default@src
0 val_0
0 val_0
0 val_0
+PREHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_test
+POSTHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_test
+PREHOOK: query: INSERT INTO t_test VALUES
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_test
+POSTHOOK: query: INSERT INTO t_test VALUES
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_test
+POSTHOOK: Lineage: t_test.a SCRIPT []
+POSTHOOK: Lineage: t_test.b SCRIPT []
+POSTHOOK: Lineage: t_test.c SCRIPT []
+PREHOOK: query: EXPLAIN
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t_test
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: int), b (type: int)
+ outputColumnNames: a, b
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Top N Key Operator
+ sort order: ++
+ keys: a (type: int), b (type: int)
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ top n: 3
+ Group By Operator
+ keys: a (type: int), b (type: int)
+ minReductionHashAggr: 0.3333333
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: int), _col1 (type: int)
+ Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ Execution mode: llap
+ LLAP IO: no inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: int), KEY._col1 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 3
+ Statistics: Num rows: 3 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 3 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 3
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 1
+5 2
+6 2
+PREHOOK: query: EXPLAIN
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t_test
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: int), b (type: int)
+ outputColumnNames: a, b
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Top N Key Operator
+ sort order: +
+ keys: a (type: int)
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ top n: 2
+ Group By Operator
+ aggregations: count(b)
+ keys: a (type: int)
+ minReductionHashAggr: 0.6666666
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ value expressions: _col1 (type: bigint)
+ Execution mode: llap
+ LLAP IO: no inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ value expressions: _col1 (type: bigint)
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: bigint)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 2
+ Statistics: Num rows: 2 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 2
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+NULL 0
+5 4
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+NULL 0
+5 4
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
+PREHOOK: query: DROP TABLE IF EXISTS t_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t_test
+PREHOOK: Output: default@t_test
+POSTHOOK: query: DROP TABLE IF EXISTS t_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: default@t_test
+PREHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_test
+POSTHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_test
+PREHOOK: query: INSERT INTO t_test VALUES
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_test
+POSTHOOK: query: INSERT INTO t_test VALUES
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_test
+POSTHOOK: Lineage: t_test.a SCRIPT []
+POSTHOOK: Lineage: t_test.b SCRIPT []
+POSTHOOK: Lineage: t_test.c SCRIPT []
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+7 3
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+NULL 0
+7 3
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+7 3
+6 1
+PREHOOK: query: DROP TABLE IF EXISTS t_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t_test
+PREHOOK: Output: default@t_test
+POSTHOOK: query: DROP TABLE IF EXISTS t_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: default@t_test
diff --git a/ql/src/test/results/clientpositive/tez/topnkey.q.out b/ql/src/test/results/clientpositive/tez/topnkey.q.out
index 3267f79..206c0c8 100644
--- a/ql/src/test/results/clientpositive/tez/topnkey.q.out
+++ b/ql/src/test/results/clientpositive/tez/topnkey.q.out
@@ -1,44 +1,177 @@
-PREHOOK: query: EXPLAIN
+PREHOOK: query: EXPLAIN EXTENDED
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5
PREHOOK: type: QUERY
PREHOOK: Input: default@src
PREHOOK: Output: hdfs://### HDFS PATH ###
-POSTHOOK: query: EXPLAIN
+POSTHOOK: query: EXPLAIN EXTENDED
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
POSTHOOK: Output: hdfs://### HDFS PATH ###
-Plan optimized by CBO.
+OPTIMIZED SQL: SELECT `key` AS `$f0`, SUM(CAST(SUBSTR(`value`, 5) AS INTEGER)) AS `$f1`
+FROM `default`.`src`
+GROUP BY `key`
+ORDER BY `key`
+LIMIT 5
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
-Vertex dependency in root stage
-Reducer 2 <- Map 1 (SIMPLE_EDGE)
-Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: src
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ GatherStats: false
+ Select Operator
+ expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ Top N Key Operator
+ sort order: +
+ keys: _col0 (type: string)
+ null sort order: z
+ Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ top n: 5
+ Group By Operator
+ aggregations: sum(_col1)
+ keys: _col0 (type: string)
+ minReductionHashAggr: 0.5
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: string)
+ Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ tag: -1
+ TopN: 5
+ TopN Hash Memory Usage: 0.1
+ value expressions: _col1 (type: bigint)
+ auto parallelism: true
+ Path -> Alias:
+ hdfs://### HDFS PATH ### [src]
+ Path -> Partition:
+ hdfs://### HDFS PATH ###
+ Partition
+ base file name: src
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns key,value
+ columns.comments 'default','default'
+ columns.types string:string
+#### A masked pattern was here ####
+ location hdfs://### HDFS PATH ###
+ name default.src
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns key,value
+ columns.comments 'default','default'
+ columns.types string:string
+#### A masked pattern was here ####
+ location hdfs://### HDFS PATH ###
+ name default.src
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.src
+ name: default.src
+ Truncated Path -> Alias:
+ /src [src]
+ Reducer 2
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: string)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: string)
+ null sort order: z
+ sort order: +
+ Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ tag: -1
+ TopN: 5
+ TopN Hash Memory Usage: 0.1
+ value expressions: _col1 (type: bigint)
+ auto parallelism: false
+ Reducer 3
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 5
+ Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ directory: hdfs://### HDFS PATH ###
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+ Stats Publishing Key Prefix: hdfs://### HDFS PATH ###
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
-Stage-0
- Fetch Operator
- limit:5
- Stage-1
- Reducer 3
- File Output Operator [FS_10]
- Limit [LIM_9] (rows=5 width=95)
- Number of rows:5
- Select Operator [SEL_8] (rows=250 width=95)
- Output:["_col0","_col1"]
- <-Reducer 2 [SIMPLE_EDGE]
- SHUFFLE [RS_7]
- Group By Operator [GBY_5] (rows=250 width=95)
- Output:["_col0","_col1"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0
- <-Map 1 [SIMPLE_EDGE]
- SHUFFLE [RS_4]
- PartitionCols:_col0
- Group By Operator [GBY_3] (rows=250 width=95)
- Output:["_col0","_col1"],aggregations:["sum(_col1)"],keys:_col0
- Top N Key Operator [TNK_11] (rows=500 width=178)
- keys:_col0,sort order:+,top n:5
- Select Operator [SEL_1] (rows=500 width=178)
- Output:["_col0","_col1"]
- TableScan [TS_0] (rows=500 width=178)
- default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
+ Stage: Stage-0
+ Fetch Operator
+ limit: 5
+ Processor Tree:
+ ListSink
PREHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5
PREHOOK: type: QUERY
@@ -228,3 +361,290 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
0 val_0
0 val_0
0 val_0
+PREHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_test
+POSTHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_test
+PREHOOK: query: INSERT INTO t_test VALUES
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_test
+POSTHOOK: query: INSERT INTO t_test VALUES
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_test
+POSTHOOK: Lineage: t_test.a SCRIPT []
+POSTHOOK: Lineage: t_test.b SCRIPT []
+POSTHOOK: Lineage: t_test.c SCRIPT []
+PREHOOK: query: EXPLAIN
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: EXPLAIN
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:3
+ Stage-1
+ Reducer 3
+ File Output Operator [FS_9]
+ Limit [LIM_8] (rows=3 width=5)
+ Number of rows:3
+ Select Operator [SEL_7] (rows=6 width=4)
+ Output:["_col0","_col1"]
+ <-Reducer 2 [SIMPLE_EDGE]
+ SHUFFLE [RS_6]
+ Group By Operator [GBY_4] (rows=6 width=4)
+ Output:["_col0","_col1"],keys:KEY._col0, KEY._col1
+ <-Map 1 [SIMPLE_EDGE]
+ SHUFFLE [RS_3]
+ PartitionCols:_col0, _col1
+ Group By Operator [GBY_2] (rows=6 width=4)
+ Output:["_col0","_col1"],keys:a, b
+ Top N Key Operator [TNK_10] (rows=12 width=6)
+ keys:a, b,sort order:++,top n:3
+ Select Operator [SEL_1] (rows=12 width=6)
+ Output:["a","b"]
+ TableScan [TS_0] (rows=12 width=6)
+ default@t_test,t_test,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
+
+PREHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+5 1
+5 2
+6 2
+PREHOOK: query: EXPLAIN
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: EXPLAIN
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:2
+ Stage-1
+ Reducer 3
+ File Output Operator [FS_9]
+ Limit [LIM_8] (rows=2 width=10)
+ Number of rows:2
+ Select Operator [SEL_7] (rows=4 width=10)
+ Output:["_col0","_col1"]
+ <-Reducer 2 [SIMPLE_EDGE]
+ SHUFFLE [RS_6]
+ Group By Operator [GBY_4] (rows=4 width=10)
+ Output:["_col0","_col1"],aggregations:["count(VALUE._col0)"],keys:KEY._col0
+ <-Map 1 [SIMPLE_EDGE]
+ SHUFFLE [RS_3]
+ PartitionCols:_col0
+ Group By Operator [GBY_2] (rows=4 width=10)
+ Output:["_col0","_col1"],aggregations:["count(b)"],keys:a
+ Top N Key Operator [TNK_10] (rows=12 width=6)
+ keys:a,sort order:+,top n:2
+ Select Operator [SEL_1] (rows=12 width=6)
+ Output:["a","b"]
+ TableScan [TS_0] (rows=12 width=6)
+ default@t_test,t_test,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
+
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+NULL 0
+5 4
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+NULL 0
+5 4
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+5 4
+6 1
+PREHOOK: query: DROP TABLE IF EXISTS t_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t_test
+PREHOOK: Output: default@t_test
+POSTHOOK: query: DROP TABLE IF EXISTS t_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: default@t_test
+PREHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_test
+POSTHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_test
+PREHOOK: query: INSERT INTO t_test VALUES
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_test
+POSTHOOK: query: INSERT INTO t_test VALUES
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_test
+POSTHOOK: Lineage: t_test.a SCRIPT []
+POSTHOOK: Lineage: t_test.b SCRIPT []
+POSTHOOK: Lineage: t_test.c SCRIPT []
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+7 3
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+NULL 0
+7 3
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+7 3
+6 1
+PREHOOK: query: DROP TABLE IF EXISTS t_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t_test
+PREHOOK: Output: default@t_test
+POSTHOOK: query: DROP TABLE IF EXISTS t_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: default@t_test
diff --git a/ql/src/test/results/clientpositive/topnkey.q.out b/ql/src/test/results/clientpositive/topnkey.q.out
index 7ea1a7a..66d0eca 100644
--- a/ql/src/test/results/clientpositive/topnkey.q.out
+++ b/ql/src/test/results/clientpositive/topnkey.q.out
@@ -1,13 +1,18 @@
-PREHOOK: query: EXPLAIN
+PREHOOK: query: EXPLAIN EXTENDED
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
-POSTHOOK: query: EXPLAIN
+POSTHOOK: query: EXPLAIN EXTENDED
SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
+OPTIMIZED SQL: SELECT `key` AS `$f0`, SUM(CAST(SUBSTR(`value`, 5) AS INTEGER)) AS `$f1`
+FROM `default`.`src`
+GROUP BY `key`
+ORDER BY `key`
+LIMIT 5
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-2 depends on stages: Stage-1
@@ -20,6 +25,7 @@ STAGE PLANS:
TableScan
alias: src
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
+ GatherStats: false
Select Operator
expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int)
outputColumnNames: _col0, _col1
@@ -33,11 +39,69 @@ STAGE PLANS:
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
+ null sort order: z
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ tag: -1
+ TopN: 5
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: bigint)
+ auto parallelism: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: src
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns key,value
+ columns.comments 'default','default'
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.src
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns key,value
+ columns.comments 'default','default'
+ columns.types string:string
+#### A masked pattern was here ####
+ name default.src
+ numFiles 1
+ numRows 500
+ rawDataSize 5312
+ serialization.ddl struct src { string key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.src
+ name: default.src
+ Truncated Path -> Alias:
+ /src [$hdt$_0:src]
+ Needs Tagging: false
Reduce Operator Tree:
Group By Operator
aggregations: sum(VALUE._col0)
@@ -47,21 +111,66 @@ STAGE PLANS:
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ column.name.delimiter ,
+ columns _col0,_col1
+ columns.types string,bigint
+ escape.delim \
+ serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
Stage: Stage-2
Map Reduce
Map Operator Tree:
TableScan
+ GatherStats: false
Reduce Output Operator
key expressions: _col0 (type: string)
+ null sort order: z
sort order: +
Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE
+ tag: -1
+ TopN: 5
TopN Hash Memory Usage: 0.1
value expressions: _col1 (type: bigint)
+ auto parallelism: false
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: -mr-10004
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ column.name.delimiter ,
+ columns _col0,_col1
+ columns.types string,bigint
+ escape.delim \
+ serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+ serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ column.name.delimiter ,
+ columns _col0,_col1
+ columns.types string,bigint
+ escape.delim \
+ serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+ serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+ Truncated Path -> Alias:
+#### A masked pattern was here ####
+ Needs Tagging: false
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint)
@@ -72,11 +181,26 @@ STAGE PLANS:
Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
+#### A masked pattern was here ####
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ columns _col0,_col1
+ columns.types string:bigint
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
Stage: Stage-0
Fetch Operator
@@ -313,3 +437,374 @@ POSTHOOK: Input: default@src
0 val_0
0 val_0
0 val_0
+PREHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_test
+POSTHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_test
+PREHOOK: query: INSERT INTO t_test VALUES
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_test
+POSTHOOK: query: INSERT INTO t_test VALUES
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_test
+POSTHOOK: Lineage: t_test.a SCRIPT []
+POSTHOOK: Lineage: t_test.b SCRIPT []
+POSTHOOK: Lineage: t_test.c SCRIPT []
+PREHOOK: query: EXPLAIN
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: t_test
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: int), b (type: int)
+ outputColumnNames: a, b
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ keys: a (type: int), b (type: int)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Map-reduce partition columns: _col0 (type: int), _col1 (type: int)
+ Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ Reduce Operator Tree:
+ Group By Operator
+ keys: KEY._col0 (type: int), KEY._col1 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+ Stage: Stage-2
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int)
+ sort order: ++
+ Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: int)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 3
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 3
+ Processor Tree:
+ ListSink
+
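In the plan above, "TopN Hash Memory Usage: 0.1" marks the ReduceSink-side top-n hash: both stages already discard keys that cannot make the final LIMIT 3, instead of sorting everything and cutting afterwards. The TopNKeyOperator this patch extends applies the same early-filtering idea, with the KeyWrapperComparator deciding which keys stay. A minimal, self-contained sketch of the technique (illustrative names, not the Hive operator itself): keep a bounded max-heap of the n best keys seen so far and drop any row whose key is worse than the current n-th key.

    import java.util.Comparator;
    import java.util.PriorityQueue;

    // Sketch only: a bounded top-n key filter. "ordering" is the query's
    // composed sort order (direction and null placement already applied).
    final class TopNKeyFilter<K> {
      private final int n;
      private final Comparator<K> ordering;
      private final PriorityQueue<K> heap;   // max-heap w.r.t. ordering

      TopNKeyFilter(int n, Comparator<K> ordering) {
        this.n = n;
        this.ordering = ordering;
        this.heap = new PriorityQueue<>(n + 1, ordering.reversed());
      }

      // True if the key can still be part of the final top n rows;
      // keys strictly worse than the current n-th key are dropped,
      // ties with the boundary key are kept.
      boolean canForward(K key) {
        if (heap.size() < n) {
          heap.offer(key);
          return true;
        }
        if (ordering.compare(key, heap.peek()) > 0) {
          return false;                 // strictly worse than the n-th key
        }
        heap.offer(key);
        heap.poll();                    // evict the previous n-th key
        return true;
      }
    }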
+PREHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 1
+5 2
+6 2
+PREHOOK: query: EXPLAIN
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: t_test
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: a (type: int), b (type: int)
+ outputColumnNames: a, b
+ Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count(b)
+ keys: a (type: int)
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Map-reduce partition columns: _col0 (type: int)
+ Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ value expressions: _col1 (type: bigint)
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ keys: KEY._col0 (type: int)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+ Stage: Stage-2
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ Reduce Output Operator
+ key expressions: _col0 (type: int)
+ sort order: +
+ Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ TopN Hash Memory Usage: 0.1
+ value expressions: _col1 (type: bigint)
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: bigint)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
+ Limit
+ Number of rows: 2
+ Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: 2
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+NULL 0
+5 4
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+NULL 0
+5 4
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+5 4
+6 1
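Two details make these ascending results easier to read: the NULL group reports count 0 because count(b) ignores NULL values, and the unqualified ORDER BY a in this run places NULLs last, so the NULL group only enters the LIMIT 2 window when NULLS FIRST is requested. The placement matches the JDK null-handling wrappers the patch builds on; a small plain-JDK demo (not Hive code):

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;

    public class AscNullsDemo {
      public static void main(String[] args) {
        Comparator<Integer> asc = Comparator.naturalOrder();
        List<Integer> keys = Arrays.asList(5, null, 7, 6);

        keys.sort(Comparator.nullsFirst(asc));  // [null, 5, 6, 7] ~ ASC NULLS FIRST
        System.out.println(keys);

        keys.sort(Comparator.nullsLast(asc));   // [5, 6, 7, null] ~ ASC NULLS LAST
        System.out.println(keys);
      }
    }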
+PREHOOK: query: DROP TABLE IF EXISTS t_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t_test
+PREHOOK: Output: default@t_test
+POSTHOOK: query: DROP TABLE IF EXISTS t_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: default@t_test
+PREHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t_test
+POSTHOOK: query: CREATE TABLE t_test(
+ a int,
+ b int,
+ c int
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t_test
+PREHOOK: query: INSERT INTO t_test VALUES
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t_test
+POSTHOOK: query: INSERT INTO t_test VALUES
+(7, 8, 4), (7, 8, 4), (7, 8, 4),
+(NULL, NULL, NULL),
+(5, 2, 3),
+(NULL, NULL, NULL),
+(NULL, NULL, NULL),
+(6, 2, 1),
+(5, 1, 2), (5, 1, 2), (5, 1, 2),
+(NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t_test
+POSTHOOK: Lineage: t_test.a SCRIPT []
+POSTHOOK: Lineage: t_test.b SCRIPT []
+POSTHOOK: Lineage: t_test.c SCRIPT []
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+7 3
+6 1
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+NULL 0
+7 3
+PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t_test
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t_test
+#### A masked pattern was here ####
+7 3
+6 1
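For the descending variants the order of comparator composition matters, and it is why the patched code applies reversed() before wrapping with nullsFirst/nullsLast: reversing an already-wrapped comparator would flip the null placement along with the value order. A plain-JDK illustration:

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;

    public class DescNullsDemo {
      public static void main(String[] args) {
        Comparator<Integer> asc = Comparator.naturalOrder();
        List<Integer> keys = Arrays.asList(5, null, 7, 6);

        // DESC NULLS FIRST: reverse the value ordering, then add null handling.
        keys.sort(Comparator.nullsFirst(asc.reversed()));  // [null, 7, 6, 5]
        System.out.println(keys);

        // Wrapping first and reversing afterwards moves the nulls too,
        // which would silently turn NULLS FIRST into NULLS LAST.
        keys.sort(Comparator.nullsFirst(asc).reversed());  // [7, 6, 5, null]
        System.out.println(keys);
      }
    }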
+PREHOOK: query: DROP TABLE IF EXISTS t_test
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t_test
+PREHOOK: Output: default@t_test
+POSTHOOK: query: DROP TABLE IF EXISTS t_test
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t_test
+POSTHOOK: Output: default@t_test
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
index df6e788..ab1cfdc 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java
@@ -18,12 +18,16 @@
package org.apache.hadoop.hive.serde2.objectinspector;
+import static java.util.Comparator.nullsFirst;
+import static java.util.Comparator.nullsLast;
+
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.lang.reflect.Type;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
@@ -944,20 +948,41 @@ public final class ObjectInspectorUtils {
return 0;
}
- public static int compare(Object[] o1, ObjectInspector[] oi1, Object[] o2,
- ObjectInspector[] oi2, boolean[] columnSortOrderIsDesc) {
- assert (o1.length == oi1.length);
- assert (o2.length == oi2.length);
- assert (o1.length == o2.length);
+ public static int compare(
+ Object[] objectArray1, ObjectInspector[] oi1, Object[] objectArray2, ObjectInspector[] oi2,
+ boolean[] columnSortOrderIsDesc, NullValueOption[] nullSortOrder) {
+ assert (objectArray1.length == objectArray2.length);
+ assert (objectArray1.length == oi1.length);
+ assert (objectArray2.length == oi2.length);
+ assert (columnSortOrderIsDesc.length == objectArray1.length);
+ assert (nullSortOrder.length == objectArray1.length);
- for (int i = 0; i < o1.length; i++) {
- int r = compare(o1[i], oi1[i], o2[i], oi2[i]);
- if (r != 0) {
- if (columnSortOrderIsDesc[i]) {
- return r;
- } else {
- return -r;
- }
+ List<Comparator<Object>> comparators = new ArrayList<>(oi1.length);
+ for (int i = 0; i < oi1.length; i++) {
+ final int keyIndex = i;
+
+ Comparator<Object> comparator = (o1, o2) -> compare(
+ o1, oi1[keyIndex], o2, oi2[keyIndex]);
+
+ if (columnSortOrderIsDesc[i]) {
+ comparator = comparator.reversed();
+ }
+
+ if (nullSortOrder[i] == NullValueOption.MAXVALUE) {
+ comparators.add(nullsFirst(comparator));
+ } else {
+ comparators.add(nullsLast(comparator));
+ }
+ }
+
+ return compare(comparators, objectArray1, objectArray2);
+ }
+
+ public static <T> int compare(List<Comparator<T>> comparatorList, T[] o1, T[] o2) {
+ for (int i = 0; i < comparatorList.size(); ++i) {
+ int c = comparatorList.get(i).compare(o1[i], o2[i]);
+ if (c != 0) {
+ return c;
}
}
return 0;
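Taken together, the rewritten compare() builds one Comparator per key column -- direction via reversed(), null placement via nullsFirst/nullsLast -- and the generic overload walks that chain until the first non-zero column comparison. A self-contained sketch of the same composition on plain Comparable values (illustrative only; the real method compares through the ObjectInspector pairs):

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    public class RowComparatorSketch {
      public static void main(String[] args) {
        // Base column comparator; the Hive version delegates to
        // compare(Object, ObjectInspector, Object, ObjectInspector).
        Comparator<Object> base =
            (o1, o2) -> ((Comparable<Object>) o1).compareTo(o2);

        List<Comparator<Object>> chain = new ArrayList<>();
        chain.add(Comparator.nullsLast(base));              // col 0: ASC NULLS LAST
        chain.add(Comparator.nullsFirst(base.reversed()));  // col 1: DESC NULLS FIRST

        Object[] row1 = {5, null};
        Object[] row2 = {5, 2};

        // Same walk as compare(List<Comparator<T>>, T[], T[]) above:
        // the first non-zero column comparison decides the order.
        int c = 0;
        for (int i = 0; i < chain.size() && c == 0; i++) {
          c = chain.get(i).compare(row1[i], row2[i]);
        }
        System.out.println(c < 0 ? "row1 sorts first" : "row2 sorts first");
      }
    }

Run on the two sample rows, column 0 ties (5 vs 5) and column 1's NULLS FIRST rule puts row1 ahead, so this prints "row1 sorts first".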