You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2016/05/30 23:07:00 UTC
hive git commit: HIVE-13849: Wrong plan for
hive.optimize.sort.dynamic.partition=true (Jesus Camacho Rodriguez,
reviewed by Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master 20c3bd9fb -> 07f593a73
HIVE-13849: Wrong plan for hive.optimize.sort.dynamic.partition=true (Jesus Camacho Rodriguez, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/07f593a7
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/07f593a7
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/07f593a7
Branch: refs/heads/master
Commit: 07f593a7387ce8016feaa444f14a0c4fe481da91
Parents: 20c3bd9
Author: Jesus Camacho Rodriguez <jc...@apache.org>
Authored: Wed May 25 17:58:48 2016 +0100
Committer: Jesus Camacho Rodriguez <jc...@apache.org>
Committed: Tue May 31 00:05:43 2016 +0100
----------------------------------------------------------------------
.../optimizer/SortedDynPartitionOptimizer.java | 48 ++++++++++++-
.../dynpart_sort_optimization_acid2.q | 12 ++++
.../dynpart_sort_opt_vectorization.q.out | 12 ++--
.../dynpart_sort_optimization.q.out | 4 +-
.../dynpart_sort_optimization_acid2.q.out | 72 ++++++++++++++++++++
.../tez/dynpart_sort_opt_vectorization.q.out | 12 ++--
.../tez/dynpart_sort_optimization.q.out | 4 +-
7 files changed, 145 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/07f593a7/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java
index 36b7036..febd446 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/SortedDynPartitionOptimizer.java
@@ -57,7 +57,6 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.ListBucketingCtx;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
@@ -193,8 +192,16 @@ public class SortedDynPartitionOptimizer extends Transform {
sortPositions = Arrays.asList(0);
sortOrder = Arrays.asList(1); // 1 means asc, could really use enum here in the thrift if
} else {
- sortPositions = getSortPositions(destTable.getSortCols(), destTable.getCols());
- sortOrder = getSortOrders(destTable.getSortCols(), destTable.getCols());
+ if (!destTable.getSortCols().isEmpty()) {
+ // Sort columns specified by table
+ sortPositions = getSortPositions(destTable.getSortCols(), destTable.getCols());
+ sortOrder = getSortOrders(destTable.getSortCols(), destTable.getCols());
+ } else {
+ // Infer sort columns from operator tree
+ sortPositions = Lists.newArrayList();
+ sortOrder = Lists.newArrayList();
+ inferSortPositions(fsParent, sortPositions, sortOrder);
+ }
}
List<Integer> sortNullOrder = new ArrayList<Integer>();
for (int order : sortOrder) {
@@ -380,6 +387,41 @@ public class SortedDynPartitionOptimizer extends Transform {
return posns;
}
+ // Try to infer possible sort columns in the query
+ // i.e. the sequence must be pRS-SEL*-fsParent
+ // Returns nothing: inferred columns are added to sortPositions/sortOrder; both are cleared if a pRS key is not found
+ private void inferSortPositions(Operator<? extends OperatorDesc> fsParent,
+ List<Integer> sortPositions, List<Integer> sortOrder) throws SemanticException {
+ // If it is not a SEL operator, we bail out
+ if (!(fsParent instanceof SelectOperator)) {
+ return;
+ }
+ SelectOperator pSel = (SelectOperator) fsParent;
+ Operator<? extends OperatorDesc> parent = pSel;
+ while (!(parent instanceof ReduceSinkOperator)) {
+ if (parent.getNumParent() != 1 ||
+ !(parent instanceof SelectOperator)) {
+ return;
+ }
+ parent = parent.getParentOperators().get(0);
+ }
+ // Backtrack SEL columns to pRS
+ List<ExprNodeDesc> selColsInPRS =
+ ExprNodeDescUtils.backtrack(pSel.getConf().getColList(), pSel, parent);
+ ReduceSinkOperator pRS = (ReduceSinkOperator) parent;
+ for (int i = 0; i < pRS.getConf().getKeyCols().size(); i++) {
+ ExprNodeDesc col = pRS.getConf().getKeyCols().get(i);
+ int pos = selColsInPRS.indexOf(col);
+ if (pos == -1) {
+ sortPositions.clear();
+ sortOrder.clear();
+ return;
+ }
+ sortPositions.add(pos);
+ sortOrder.add(pRS.getConf().getOrder().charAt(i) == '+' ? 1 : 0); // 1 asc, 0 desc
+ }
+ }
+
public ReduceSinkOperator getReduceSinkOp(List<Integer> partitionPositions,
List<Integer> sortPositions, List<Integer> sortOrder, List<Integer> sortNullOrder,
ArrayList<ExprNodeDesc> allCols, ArrayList<ExprNodeDesc> bucketColumns, int numBuckets,
http://git-wip-us.apache.org/repos/asf/hive/blob/07f593a7/ql/src/test/queries/clientpositive/dynpart_sort_optimization_acid2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/dynpart_sort_optimization_acid2.q b/ql/src/test/queries/clientpositive/dynpart_sort_optimization_acid2.q
new file mode 100644
index 0000000..c115e62
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/dynpart_sort_optimization_acid2.q
@@ -0,0 +1,12 @@
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.optimize.sort.dynamic.partition=true;
+
+CREATE TABLE non_acid(key string, value string)
+PARTITIONED BY(ds string, hr int)
+CLUSTERED BY(key) INTO 2 BUCKETS
+STORED AS ORC;
+
+explain
+insert into table non_acid partition(ds,hr) select * from srcpart sort by value;
http://git-wip-us.apache.org/repos/asf/hive/blob/07f593a7/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
index ab8f96c..fc4f483 100644
--- a/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
+++ b/ql/src/test/results/clientpositive/dynpart_sort_opt_vectorization.q.out
@@ -181,11 +181,11 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
Execution mode: vectorized
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: KEY._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
File Output Operator
@@ -517,11 +517,11 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
Execution mode: vectorized
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: KEY._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
File Output Operator
@@ -1314,11 +1314,11 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float)
Execution mode: vectorized
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: VALUE._col0 (type: smallint), KEY._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
File Output Operator
http://git-wip-us.apache.org/repos/asf/hive/blob/07f593a7/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out
index e0d022f..d24ee16 100644
--- a/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out
+++ b/ql/src/test/results/clientpositive/dynpart_sort_optimization.q.out
@@ -1262,10 +1262,10 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float)
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: VALUE._col0 (type: smallint), KEY._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE
File Output Operator
http://git-wip-us.apache.org/repos/asf/hive/blob/07f593a7/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid2.q.out b/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid2.q.out
new file mode 100644
index 0000000..0b6e992
--- /dev/null
+++ b/ql/src/test/results/clientpositive/dynpart_sort_optimization_acid2.q.out
@@ -0,0 +1,72 @@
+PREHOOK: query: CREATE TABLE non_acid(key string, value string)
+PARTITIONED BY(ds string, hr int)
+CLUSTERED BY(key) INTO 2 BUCKETS
+STORED AS ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@non_acid
+POSTHOOK: query: CREATE TABLE non_acid(key string, value string)
+PARTITIONED BY(ds string, hr int)
+CLUSTERED BY(key) INTO 2 BUCKETS
+STORED AS ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@non_acid
+PREHOOK: query: explain
+insert into table non_acid partition(ds,hr) select * from srcpart sort by value
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+insert into table non_acid partition(ds,hr) select * from srcpart sort by value
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+ Stage-2 depends on stages: Stage-0
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: srcpart
+ Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: string), value (type: string), ds (type: string), hr (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col2 (type: string), _col3 (type: string), '_bucket_number' (type: string), _col1 (type: string)
+ sort order: ++++
+ Map-reduce partition columns: _col2 (type: string), _col3 (type: string)
+ Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: string)
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col0 (type: string), KEY._col1 (type: string), KEY._col2 (type: string), KEY._col3 (type: string), KEY.'_bucket_number' (type: string)
+ outputColumnNames: _col0, _col1, _col2, _col3, '_bucket_number'
+ Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.non_acid
+
+ Stage: Stage-0
+ Move Operator
+ tables:
+ partition:
+ ds
+ hr
+ replace: false
+ table:
+ input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.non_acid
+
+ Stage: Stage-2
+ Stats-Aggr Operator
+
http://git-wip-us.apache.org/repos/asf/hive/blob/07f593a7/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
index 9a72586..789dd5e 100644
--- a/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
+++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_opt_vectorization.q.out
@@ -188,13 +188,13 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
Execution mode: vectorized
Reducer 2
Execution mode: vectorized
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: KEY._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
File Output Operator
@@ -563,13 +563,13 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
Execution mode: vectorized
Reducer 2
Execution mode: vectorized
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: KEY._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
File Output Operator
@@ -1411,13 +1411,13 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float)
Execution mode: vectorized
Reducer 2
Execution mode: vectorized
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: VALUE._col0 (type: smallint), KEY._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1048 Data size: 310873 Basic stats: COMPLETE Column stats: NONE
File Output Operator
http://git-wip-us.apache.org/repos/asf/hive/blob/07f593a7/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
index 6689394..9f12f5b 100644
--- a/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
+++ b/ql/src/test/results/clientpositive/tez/dynpart_sort_optimization.q.out
@@ -1348,11 +1348,11 @@ STAGE PLANS:
sort order: ++
Map-reduce partition columns: _col4 (type: tinyint)
Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE
- value expressions: _col0 (type: smallint), _col1 (type: int), _col2 (type: bigint), _col3 (type: float)
+ value expressions: _col0 (type: smallint), _col2 (type: bigint), _col3 (type: float)
Reducer 2
Reduce Operator Tree:
Select Operator
- expressions: VALUE._col0 (type: smallint), VALUE._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
+ expressions: VALUE._col0 (type: smallint), KEY._col1 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), KEY._col4 (type: tinyint)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 4442 Data size: 106611 Basic stats: COMPLETE Column stats: NONE
File Output Operator