You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2020/03/17 17:58:07 UTC
[hive] branch master updated: HIVE-23011: Shared work optimizer should check residual predicates when comparing joins (Jesus Camacho Rodriguez, reviewed by Vineet Garg)

This is an automated email from the ASF dual-hosted git repository.

jcamacho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 26cc315  HIVE-23011: Shared work optimizer should check residual predicates when comparing joins (Jesus Camacho Rodriguez, reviewed by Vineet Garg)
26cc315 is described below

commit 26cc3154c061d2194fba1c3bb156bb7e06e4a6c5
Author: Jesus Camacho Rodriguez <jc...@apache.org>
AuthorDate: Tue Mar 17 10:57:43 2020 -0700

    HIVE-23011: Shared work optimizer should check residual predicates when comparing joins (Jesus Camacho Rodriguez, reviewed by Vineet Garg)
---
 .../test/resources/testconfiguration.properties    |   1 +
 .../org/apache/hadoop/hive/ql/plan/JoinDesc.java   |   1 +
 .../queries/clientpositive/sharedworkresidual.q    |  46 +++
 .../clientpositive/llap/sharedworkresidual.q.out   | 321 +++++++++++++++++++++
 4 files changed, 369 insertions(+)

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 9d27b8d..f71ed3d 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -792,6 +792,7 @@ minillaplocal.query.files=\
   semijoin_reddedup,\
   semijoin_reddedup.q,\
   sharedworkext.q,\
+  sharedworkresidual.q,\
   smb_cache.q,\
   special_character_in_tabnames_1.q,\
   sqlmerge.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
index 2c93c2a..523fe54 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
@@ -749,6 +749,7 @@ public class JoinDesc extends AbstractOperatorDesc {
           Objects.equals(getFiltersStringMap(), otherDesc.getFiltersStringMap()) &&
           Objects.equals(getOutputColumnNames(), otherDesc.getOutputColumnNames()) &&
           Objects.equals(getCondsList(), otherDesc.getCondsList()) &&
+          Objects.equals(getResidualFilterExprsString(), otherDesc.getResidualFilterExprsString()) &&
           getHandleSkewJoin() == otherDesc.getHandleSkewJoin() &&
           Objects.equals(getNullSafeString(), otherDesc.getNullSafeString());
     }
diff --git a/ql/src/test/queries/clientpositive/sharedworkresidual.q b/ql/src/test/queries/clientpositive/sharedworkresidual.q
new file mode 100644
index 0000000..e8a4b37
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/sharedworkresidual.q
@@ -0,0 +1,46 @@
+CREATE TABLE dimension_date (
+  id int,
+  dateid int,
+  completedate string,
+  daynumberinweek tinyint,
+  dayfullname string,
+  daynumberinmonth tinyint,
+  daynumberinyear int,
+  weeknumberinyear tinyint,
+  monthnumberinyear tinyint,
+  monthfullname string,
+  quarternumber tinyint,
+  quartername string,
+  yearnumber int,
+  weekstartdate string,
+  weekstartdateid int,
+  monthstartdate string,
+  monthstartdateid int);
+
+
+explain
+ with daily as (
+ select * 
+ from dimension_date
+ where dateid = 20200228
+ ),
+ weekly as (
+ select dly.dateid,count(1) as mb_wk
+ from dimension_date dly
+ left join dimension_date wk
+ ON datediff(dly.completedate, wk.completedate) >= 0 
+ AND datediff(dly.completedate, wk.completedate) < 6
+ GROUP BY dly.dateid
+ ),
+ monthly as (
+ select dly.dateid,count(1) as nb_monthly
+ from dimension_date dly
+ left join dimension_date wk
+ ON datediff(dly.completedate, wk.completedate) >= 0 
+ AND datediff(dly.completedate, wk.completedate) < 28
+ GROUP BY dly.dateid
+ )
+ select daily.dateid,mb_wk,nb_monthly
+ from daily
+ left join weekly on daily.dateid = weekly.dateid
+ left join monthly on daily.dateid = monthly.dateid;
diff --git a/ql/src/test/results/clientpositive/llap/sharedworkresidual.q.out b/ql/src/test/results/clientpositive/llap/sharedworkresidual.q.out
new file mode 100644
index 0000000..20eb245
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/sharedworkresidual.q.out
@@ -0,0 +1,321 @@
+PREHOOK: query: CREATE TABLE dimension_date (
+  id int,
+  dateid int,
+  completedate string,
+  daynumberinweek tinyint,
+  dayfullname string,
+  daynumberinmonth tinyint,
+  daynumberinyear int,
+  weeknumberinyear tinyint,
+  monthnumberinyear tinyint,
+  monthfullname string,
+  quarternumber tinyint,
+  quartername string,
+  yearnumber int,
+  weekstartdate string,
+  weekstartdateid int,
+  monthstartdate string,
+  monthstartdateid int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@dimension_date
+POSTHOOK: query: CREATE TABLE dimension_date (
+  id int,
+  dateid int,
+  completedate string,
+  daynumberinweek tinyint,
+  dayfullname string,
+  daynumberinmonth tinyint,
+  daynumberinyear int,
+  weeknumberinyear tinyint,
+  monthnumberinyear tinyint,
+  monthfullname string,
+  quarternumber tinyint,
+  quartername string,
+  yearnumber int,
+  weekstartdate string,
+  weekstartdateid int,
+  monthstartdate string,
+  monthstartdateid int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@dimension_date
+Warning: Shuffle Join MERGEJOIN[42][tables = [$hdt$_1, $hdt$_2]] in Stage 'Reducer 5' is a cross product
+Warning: Shuffle Join MERGEJOIN[44][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product
+Warning: Shuffle Join MERGEJOIN[43][tables = [$hdt$_2, $hdt$_3]] in Stage 'Reducer 7' is a cross product
+Warning: Shuffle Join MERGEJOIN[45][tables = [$hdt$_0, $hdt$_1, $hdt$_2]] in Stage 'Reducer 3' is a cross product
+PREHOOK: query: explain
+ with daily as (
+ select * 
+ from dimension_date
+ where dateid = 20200228
+ ),
+ weekly as (
+ select dly.dateid,count(1) as mb_wk
+ from dimension_date dly
+ left join dimension_date wk
+ ON datediff(dly.completedate, wk.completedate) >= 0 
+ AND datediff(dly.completedate, wk.completedate) < 6
+ GROUP BY dly.dateid
+ ),
+ monthly as (
+ select dly.dateid,count(1) as nb_monthly
+ from dimension_date dly
+ left join dimension_date wk
+ ON datediff(dly.completedate, wk.completedate) >= 0 
+ AND datediff(dly.completedate, wk.completedate) < 28
+ GROUP BY dly.dateid
+ )
+ select daily.dateid,mb_wk,nb_monthly
+ from daily
+ left join weekly on daily.dateid = weekly.dateid
+ left join monthly on daily.dateid = monthly.dateid
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dimension_date
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+ with daily as (
+ select * 
+ from dimension_date
+ where dateid = 20200228
+ ),
+ weekly as (
+ select dly.dateid,count(1) as mb_wk
+ from dimension_date dly
+ left join dimension_date wk
+ ON datediff(dly.completedate, wk.completedate) >= 0 
+ AND datediff(dly.completedate, wk.completedate) < 6
+ GROUP BY dly.dateid
+ ),
+ monthly as (
+ select dly.dateid,count(1) as nb_monthly
+ from dimension_date dly
+ left join dimension_date wk
+ ON datediff(dly.completedate, wk.completedate) >= 0 
+ AND datediff(dly.completedate, wk.completedate) < 28
+ GROUP BY dly.dateid
+ )
+ select daily.dateid,mb_wk,nb_monthly
+ from daily
+ left join weekly on daily.dateid = weekly.dateid
+ left join monthly on daily.dateid = monthly.dateid
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dimension_date
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE), Reducer 6 (CUSTOM_SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 8 (CUSTOM_SIMPLE_EDGE)
+        Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Map 9 (CUSTOM_SIMPLE_EDGE)
+        Reducer 6 <- Reducer 5 (SIMPLE_EDGE)
+        Reducer 7 <- Map 4 (CUSTOM_SIMPLE_EDGE), Map 9 (CUSTOM_SIMPLE_EDGE)
+        Reducer 8 <- Reducer 7 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: dimension_date
+                  filterExpr: (dateid = 20200228) (type: boolean)
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: (dateid = 20200228) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        null sort order: 
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+        Map 4 
+            Map Operator Tree:
+                TableScan
+                  alias: dly
+                  filterExpr: (dateid = 20200228) (type: boolean)
+                  Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: (dateid = 20200228) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: completedate (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        null sort order: 
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col0 (type: string)
+                      Reduce Output Operator
+                        null sort order: 
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col0 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+        Map 9 
+            Map Operator Tree:
+                TableScan
+                  alias: wk
+                  Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: completedate (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      null sort order: 
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: _col0 (type: string)
+                    Reduce Output Operator
+                      null sort order: 
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: _col0 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col1
+                Statistics: Num rows: 1 Data size: 378 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  null sort order: 
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 378 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col1, _col2
+                Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: 20200228 (type: int), _col1 (type: bigint), _col2 (type: bigint)
+                  outputColumnNames: _col0, _col1, _col2
+                  Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 752 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 5 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col0, _col1
+                residual filter predicates: {(datediff(_col0, _col1) >= 0)} {(datediff(_col0, _col1) < 6)}
+                Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                  Group By Operator
+                    aggregations: count()
+                    keys: true (type: boolean)
+                    minReductionHashAggr: 0.99
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: boolean)
+                      null sort order: z
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: boolean)
+                      Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: _col1 (type: bigint)
+        Reducer 6 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                keys: KEY._col0 (type: boolean)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col1 (type: bigint)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    null sort order: 
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col0 (type: bigint)
+        Reducer 7 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Left Outer Join 0 to 1
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col0, _col1
+                residual filter predicates: {(datediff(_col0, _col1) >= 0)} {(datediff(_col0, _col1) < 28)}
+                Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                  Group By Operator
+                    aggregations: count()
+                    keys: true (type: boolean)
+                    minReductionHashAggr: 0.99
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: boolean)
+                      null sort order: z
+                      sort order: +
+                      Map-reduce partition columns: _col0 (type: boolean)
+                      Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: _col1 (type: bigint)
+        Reducer 8 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                keys: KEY._col0 (type: boolean)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                Select Operator
+                  expressions: _col1 (type: bigint)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                  Reduce Output Operator
+                    null sort order: 
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: NONE
+                    value expressions: _col0 (type: bigint)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+