You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by px...@apache.org on 2016/03/16 18:08:00 UTC
hive git commit: HIVE-13260: ReduceSinkDeDuplication throws exception
when pRS key is empty (Pengcheng Xiong, reviewed by Prasanth Jayachandran)
Repository: hive
Updated Branches:
refs/heads/master 868db42a6 -> 06b604a03
HIVE-13260: ReduceSinkDeDuplication throws exception when pRS key is empty (Pengcheng Xiong, reviewed by Prasanth Jayachandran)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/06b604a0
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/06b604a0
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/06b604a0
Branch: refs/heads/master
Commit: 06b604a03ba1c137c771c4f2dcbcd79249ffd141
Parents: 868db42
Author: Pengcheng Xiong <px...@apache.org>
Authored: Wed Mar 16 10:07:44 2016 -0700
Committer: Pengcheng Xiong <px...@apache.org>
Committed: Wed Mar 16 10:07:44 2016 -0700
----------------------------------------------------------------------
.../correlation/ReduceSinkDeDuplication.java | 18 +-
.../reduceSinkDeDuplication_pRS_key_empty.q | 60 +++++
.../reduceSinkDeDuplication_pRS_key_empty.q.out | 220 +++++++++++++++++++
3 files changed, 288 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/06b604a0/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
index 59c87a3..733620b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
@@ -312,17 +312,15 @@ public class ReduceSinkDeDuplication extends Transform {
if (result[4] > 0) {
// This case happens only when pRS key is empty in which case we can use
// number of distribution keys and key serialization info from cRS
- pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys());
- List<FieldSchema> fields = PlanUtils.getFieldSchemasFromColumnList(pRS.getConf()
- .getKeyCols(), "reducesinkkey");
- TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder(),
- pRS.getConf().getNullOrder());
- ArrayList<String> outputKeyCols = Lists.newArrayList();
- for (int i = 0; i < fields.size(); i++) {
- outputKeyCols.add(fields.get(i).getName());
+ if (pRS.getConf().getKeyCols() != null && pRS.getConf().getKeyCols().size() == 0
+ && cRS.getConf().getKeyCols() != null && cRS.getConf().getKeyCols().size() == 0) {
+ // As the distribution keys are a subset of keycols, their number should
+ // be 0 too. This condition may be too strict. We may extend it in the
+ // future.
+ TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(new ArrayList<FieldSchema>(), pRS
+ .getConf().getOrder(), pRS.getConf().getNullOrder());
+ pRS.getConf().setKeySerializeInfo(keyTable);
}
- pRS.getConf().setOutputKeyColumnNames(outputKeyCols);
- pRS.getConf().setKeySerializeInfo(keyTable);
}
return true;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/06b604a0/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
new file mode 100644
index 0000000..8bbae39
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
@@ -0,0 +1,60 @@
+set hive.mapred.mode=nonstrict;
+set hive.cbo.enable=false;
+
+set hive.map.aggr=false;
+
+set hive.groupby.skewindata=false;
+set mapred.reduce.tasks=31;
+
+
+select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq;
+
+explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq;
+
+select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq;
+
+set hive.optimize.reducededuplication=false;
+
+explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq;
+
+select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq;
http://git-wip-us.apache.org/repos/asf/hive/blob/06b604a0/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out b/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out
new file mode 100644
index 0000000..4a848f2
--- /dev/null
+++ b/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out
@@ -0,0 +1,220 @@
+PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+{"columntype":"Double","min":260.182,"max":260.182,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{0}{0}{0}{1}{1}{1}{0}{0}{0}{0}{0}{1}{2}{1}{0}"} {"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"} {"columntype":"Double","min":20428.07287599998,"max":20428.07287599998,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{0}{0}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}{4}{2}{0}"} {"columntype":"Double","min":20469.01089779557,"max":20469.01089779557,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{0}{1}{3}{2}{3}{5}{2}{0}{1}{0}{1}{1}{1}{1}{0}{1}"}
+PREHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: src
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: value (type: string)
+ outputColumnNames: value
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: substr(value, 5) (type: string)
+ sort order: +
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: avg(DISTINCT KEY._col0:0._col0), max(KEY._col0:0._col0), variance(KEY._col0:0._col0), var_samp(KEY._col0:0._col0)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+ Group By Operator
+ aggregations: compute_stats(_col0, 16), compute_stats(_col1, 16), compute_stats(_col2, 16), compute_stats(_col3, 16)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+{"columntype":"Double","min":256.10355987055016,"max":256.10355987055016,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{2}{1}{0}{2}{0}{1}{1}{1}{0}{0}{1}{1}{0}{2}{1}{0}"} {"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"} {"columntype":"Double","min":20428.07287599999,"max":20428.07287599999,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{4}{0}{0}{4}{3}{0}{1}{0}{0}{0}{0}{0}{0}{1}{2}"} {"columntype":"Double","min":20469.010897795582,"max":20469.010897795582,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{2}{0}{2}{2}{0}{0}{2}{0}{0}{0}{0}{0}{1}{0}{0}{0}"}
+PREHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: src
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: value (type: string)
+ outputColumnNames: value
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: substr(value, 5) (type: string)
+ sort order: +
+ Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: avg(DISTINCT KEY._col0:0._col0), max(KEY._col0:0._col0), variance(KEY._col0:0._col0), var_samp(KEY._col0:0._col0)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+ Stage: Stage-2
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ Reduce Output Operator
+ sort order:
+ Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+ value expressions: _col0 (type: double), _col1 (type: string), _col2 (type: double), _col3 (type: double)
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: compute_stats(VALUE._col0, 16), compute_stats(VALUE._col2, 16), compute_stats(VALUE._col3, 16), compute_stats(VALUE._col4, 16)
+ mode: complete
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+ avg(DISTINCT substr(src.value,5)) as a,
+ max(substr(src.value,5)) as b,
+ variance(substr(src.value,5)) as c,
+ var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+{"columntype":"Double","min":256.10355987055016,"max":256.10355987055016,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{2}{1}{0}{2}{0}{1}{1}{1}{0}{0}{1}{1}{0}{2}{1}{0}"} {"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"} {"columntype":"Double","min":20428.07287599999,"max":20428.07287599999,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{4}{0}{0}{4}{3}{0}{1}{0}{0}{0}{0}{0}{0}{1}{2}"} {"columntype":"Double","min":20469.010897795582,"max":20469.010897795582,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{2}{0}{2}{2}{0}{0}{2}{0}{0}{0}{0}{0}{1}{0}{0}{0}"}