You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by px...@apache.org on 2016/03/16 18:08:00 UTC

hive git commit: HIVE-13260: ReduceSinkDeDuplication throws exception when pRS key is empty (Pengcheng Xiong, reviewed by Prasanth Jayachandran)

Repository: hive
Updated Branches:
  refs/heads/master 868db42a6 -> 06b604a03


HIVE-13260: ReduceSinkDeDuplication throws exception when pRS key is empty (Pengcheng Xiong, reviewed by Prasanth Jayachandran)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/06b604a0
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/06b604a0
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/06b604a0

Branch: refs/heads/master
Commit: 06b604a03ba1c137c771c4f2dcbcd79249ffd141
Parents: 868db42
Author: Pengcheng Xiong <px...@apache.org>
Authored: Wed Mar 16 10:07:44 2016 -0700
Committer: Pengcheng Xiong <px...@apache.org>
Committed: Wed Mar 16 10:07:44 2016 -0700

----------------------------------------------------------------------
 .../correlation/ReduceSinkDeDuplication.java    |  18 +-
 .../reduceSinkDeDuplication_pRS_key_empty.q     |  60 +++++
 .../reduceSinkDeDuplication_pRS_key_empty.q.out | 220 +++++++++++++++++++
 3 files changed, 288 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/06b604a0/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
index 59c87a3..733620b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
@@ -312,17 +312,15 @@ public class ReduceSinkDeDuplication extends Transform {
       if (result[4] > 0) {
         // This case happens only when pRS key is empty in which case we can use
         // number of distribution keys and key serialization info from cRS
-        pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys());
-        List<FieldSchema> fields = PlanUtils.getFieldSchemasFromColumnList(pRS.getConf()
-            .getKeyCols(), "reducesinkkey");
-        TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder(),
-                pRS.getConf().getNullOrder());
-        ArrayList<String> outputKeyCols = Lists.newArrayList();
-        for (int i = 0; i < fields.size(); i++) {
-          outputKeyCols.add(fields.get(i).getName());
+        if (pRS.getConf().getKeyCols() != null && pRS.getConf().getKeyCols().size() == 0
+            && cRS.getConf().getKeyCols() != null && cRS.getConf().getKeyCols().size() == 0) {
+          // Since the distribution keys are a subset of the key columns, the number
+          // of distribution keys must be 0 too. This condition may be too strict;
+          // we may relax it in the future.
+          TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(new ArrayList<FieldSchema>(), pRS
+              .getConf().getOrder(), pRS.getConf().getNullOrder());
+          pRS.getConf().setKeySerializeInfo(keyTable);
         }
-        pRS.getConf().setOutputKeyColumnNames(outputKeyCols);
-        pRS.getConf().setKeySerializeInfo(keyTable);
       }
       return true;
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/06b604a0/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
new file mode 100644
index 0000000..8bbae39
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q
@@ -0,0 +1,60 @@
+set hive.mapred.mode=nonstrict;
+set hive.cbo.enable=false;
+
+set hive.map.aggr=false;
+
+set hive.groupby.skewindata=false;
+set mapred.reduce.tasks=31;
+
+
+select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq;
+
+explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq;
+
+select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq;
+ 
+set hive.optimize.reducededuplication=false;
+
+explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq;
+
+select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq;

http://git-wip-us.apache.org/repos/asf/hive/blob/06b604a0/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out b/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out
new file mode 100644
index 0000000..4a848f2
--- /dev/null
+++ b/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out
@@ -0,0 +1,220 @@
+PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+{"columntype":"Double","min":260.182,"max":260.182,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{0}{0}{0}{1}{1}{1}{0}{0}{0}{0}{0}{1}{2}{1}{0}"}	{"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"}	{"columntype":"Double","min":20428.07287599998,"max":20428.07287599998,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{0}{0}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}{4}{2}{0}"}	{"columntype":"Double","min":20469.01089779557,"max":20469.01089779557,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{0}{1}{3}{2}{3}{5}{2}{0}{1}{0}{1}{1}{1}{1}{0}{1}"}
+PREHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: value (type: string)
+              outputColumnNames: value
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: substr(value, 5) (type: string)
+                sort order: +
+                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: avg(DISTINCT KEY._col0:0._col0), max(KEY._col0:0._col0), variance(KEY._col0:0._col0), var_samp(KEY._col0:0._col0)
+          mode: complete
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+          Group By Operator
+            aggregations: compute_stats(_col0, 16), compute_stats(_col1, 16), compute_stats(_col2, 16), compute_stats(_col3, 16)
+            mode: complete
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+{"columntype":"Double","min":256.10355987055016,"max":256.10355987055016,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{2}{1}{0}{2}{0}{1}{1}{1}{0}{0}{1}{1}{0}{2}{1}{0}"}	{"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"}	{"columntype":"Double","min":20428.07287599999,"max":20428.07287599999,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{4}{0}{0}{4}{3}{0}{1}{0}{0}{0}{0}{0}{0}{1}{2}"}	{"columntype":"Double","min":20469.010897795582,"max":20469.010897795582,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{2}{0}{2}{2}{0}{0}{2}{0}{0}{0}{0}{0}{1}{0}{0}{0}"}
+PREHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: src
+            Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: value (type: string)
+              outputColumnNames: value
+              Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+              Reduce Output Operator
+                key expressions: substr(value, 5) (type: string)
+                sort order: +
+                Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: avg(DISTINCT KEY._col0:0._col0), max(KEY._col0:0._col0), variance(KEY._col0:0._col0), var_samp(KEY._col0:0._col0)
+          mode: complete
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe
+
+  Stage: Stage-2
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            Reduce Output Operator
+              sort order: 
+              Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+              value expressions: _col0 (type: double), _col1 (type: string), _col2 (type: double), _col3 (type: double)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: compute_stats(VALUE._col0, 16), compute_stats(VALUE._col2, 16), compute_stats(VALUE._col3, 16), compute_stats(VALUE._col4, 16)
+          mode: complete
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16)
+from
+(
+select
+  avg(DISTINCT substr(src.value,5)) as a,
+  max(substr(src.value,5)) as b,
+  variance(substr(src.value,5)) as c,
+  var_samp(substr(src.value,5)) as d
+ from src)subq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+{"columntype":"Double","min":256.10355987055016,"max":256.10355987055016,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{2}{1}{0}{2}{0}{1}{1}{1}{0}{0}{1}{1}{0}{2}{1}{0}"}	{"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"}	{"columntype":"Double","min":20428.07287599999,"max":20428.07287599999,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{4}{0}{0}{4}{3}{0}{1}{0}{0}{0}{0}{0}{0}{1}{2}"}	{"columntype":"Double","min":20469.010897795582,"max":20469.010897795582,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{2}{0}{2}{2}{0}{0}{2}{0}{0}{0}{0}{0}{1}{0}{0}{0}"}