You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/04/05 07:43:13 UTC

[impala] 01/03: IMPALA-11008: fix incorrect to propagate inferred predicates

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit abfa5a72b6a6046037f27ad02d19b31f7b31c4e5
Author: xqhe <he...@126.com>
AuthorDate: Tue Feb 15 12:24:10 2022 +0800

    IMPALA-11008: fix incorrect to propagate inferred predicates
    
    It is incorrect to propagate predicates inferred from equi-join
    conjuncts into a plan subtree that is on the nullable side of an
    outer join if the predicate is not null-filtering for the nullable
    side.
    
    For example:
    SELECT *
    FROM (
      SELECT id IS NOT NULL
        AND col IS NULL AS a
      FROM (
        SELECT A.id, B.col
        FROM A
          LEFT JOIN B ON A.id = B.id
      ) t
    ) t
    WHERE a = 1
    Before this patch the inferred predicate '(B.id is not null and
    B.col is null) = 1' is evaluated at the scanner of B. This is
    incorrect since the predicate '(A.id is not null and B.col is null)
    = 1' is not null-filtering for B.
    To generate the inferred predicate we substitue the non-outer-join
    slots first and use 'isNullableConjunct' to do a more strict check
    on the conjunct before the final substitution.
    
    Tests:
      - Add plan tests in predicate-propagation.test
      - Add new query tests to verify the correctness of inferred
        predicates propagation
      - Ran the full set of verifications in Impala Public Jenkins
    
    Change-Id: I9e64230f6d0c2b9ef1560186ceba349a5920ccdf
    Reviewed-on: http://gerrit.cloudera.org:8080/18234
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../java/org/apache/impala/analysis/Analyzer.java  |  35 ++-
 .../queries/PlannerTest/predicate-propagation.test | 262 +++++++++++++++++++++
 .../queries/QueryTest/outer-joins.test             |  60 +++++
 3 files changed, 356 insertions(+), 1 deletion(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index d8d412694..5cdadbff1 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -517,6 +517,11 @@ public class Analyzer {
     // ref.
     public Set<TupleId> zippingUnnestTupleIds = new HashSet<>();
 
+    // This holds the nullable side slot ids from the outer join's equi-join conjuncts
+    // e.g. t1 left join t2 on t1.id = t2.id, the slot id of t2.id will be added to
+    // this set.
+    public Set<SlotId> ojNullableSlotsInEquiPreds = new HashSet<>();
+
     public GlobalState(StmtTableCache stmtTableCache, TQueryCtx queryCtx,
         AuthorizationFactory authzFactory, AuthorizationContext authzCtx) {
       this.stmtTableCache = stmtTableCache;
@@ -2131,14 +2136,38 @@ public class Analyzer {
         if (srcSids.containsAll(destSids)) {
           p = srcConjunct;
         } else {
+          // It is incorrect to propagate predicates inferred from equi-join conjuncts
+          // into a plan subtree that is on the nullable side of an outer join if the
+          // predicate is not null-filtering.
+          // For example:
+          // select * from (select id is not null and col is null as a from (select A.id,
+          // B.col from A left join B on A.id = B.id) t ) t where a = 1
+          // In this query the inferred predicate (B.id is not null and B.col is null = 1)
+          // should not be evaluated at the scanner of B.
+          // We use 'ojmap' to save the mapping from src to the outer join equal slot,
+          // eg. B.id. We substitue the non-outer-join slots first and use
+          // 'isNullableConjunct' to do a more strict check on the conjunct before the
+          // final substitution.
           ExprSubstitutionMap smap = new ExprSubstitutionMap();
+          ExprSubstitutionMap ojSmap = new ExprSubstitutionMap();
           for (int i = 0; i < srcSids.size(); ++i) {
-            smap.put(
+            if (globalState_.ojNullableSlotsInEquiPreds.contains(destSids.get(i))) {
+              ojSmap.put(
                 new SlotRef(globalState_.descTbl.getSlotDesc(srcSids.get(i))),
                 new SlotRef(globalState_.descTbl.getSlotDesc(destSids.get(i))));
+            } else {
+              smap.put(
+                  new SlotRef(globalState_.descTbl.getSlotDesc(srcSids.get(i))),
+                  new SlotRef(globalState_.descTbl.getSlotDesc(destSids.get(i))));
+            }
           }
           try {
             p = srcConjunct.trySubstitute(smap, this, false);
+            if (hasOuterJoinedTuple &&
+                isNullableConjunct(p, Arrays.asList(destTid))) continue;
+            if (ojSmap.size() != 0) {
+              p = p.trySubstitute(ojSmap, this, false);
+            }
           } catch (ImpalaException exc) {
             // not an executable predicate; ignore
             continue;
@@ -2715,6 +2744,10 @@ public class Analyzer {
           || tblRef.getJoinOp() == JoinOperator.RIGHT_ANTI_JOIN) {
         g.addEdge(innerSlot.asInt(), outerSlot.asInt());
       }
+      if (tblRef.getJoinOp() == JoinOperator.LEFT_OUTER_JOIN ||
+          tblRef.getJoinOp() == JoinOperator.RIGHT_OUTER_JOIN) {
+        globalState_.ojNullableSlotsInEquiPreds.add(innerSlot);
+      }
     }
   }
 
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test b/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
index 74ff2b4d6..b136d965d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
@@ -1590,3 +1590,265 @@ PLAN-ROOT SINK
    predicates: c_custkey % 2 = 0
    row-size=8B cardinality=15.00K
 ====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate 'case when t1.id is not null and t2.some_nulls is null
+# then true else null end is not null' is not null-filtering for t2, so
+# the inferred predicate is not pushable for t2.
+SELECT COUNT(*) FROM
+(
+  SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+         THEN true ELSE NULL END AS a
+  FROM
+  (
+    SELECT STRAIGHT_JOIN t1.id
+           ,t2.some_nulls
+      FROM functional.nullrows t1
+      LEFT JOIN functional.nullrows t2
+      ON t1.id = t2.id
+   ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  row-size=8B cardinality=1
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.id = t2.id
+|  other predicates: CASE WHEN t1.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+|  row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+|     HDFS partitions=1/1 files=1 size=541B
+|     row-size=26B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+   HDFS partitions=1/1 files=1 size=541B
+   row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# Same as above but with right join
+SELECT COUNT(*) FROM
+(
+  SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+         THEN true ELSE NULL END AS a
+  FROM
+  (
+    SELECT STRAIGHT_JOIN t2.id
+           ,t1.some_nulls
+      FROM functional.nullrows t1
+      RIGHT JOIN functional.nullrows t2
+      ON t1.id = t2.id
+   ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  row-size=8B cardinality=1
+|
+02:HASH JOIN [RIGHT OUTER JOIN]
+|  hash predicates: t1.id = t2.id
+|  other predicates: CASE WHEN t2.id IS NOT NULL AND t1.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+|  runtime filters: RF000 <- t2.id
+|  row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+|     HDFS partitions=1/1 files=1 size=541B
+|     row-size=13B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+   HDFS partitions=1/1 files=1 size=541B
+   runtime filters: RF000 -> t1.id
+   row-size=26B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# It's correct to propagate inferred predicates into an inner join
+SELECT COUNT(*) FROM
+(
+  SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+         THEN true ELSE NULL END AS a
+  FROM
+  (
+    SELECT t1.id
+           ,t2.some_nulls
+      FROM functional.nullrows t1
+      JOIN functional.nullrows t2
+      ON t1.id = t2.id
+   ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  row-size=8B cardinality=1
+|
+02:HASH JOIN [INNER JOIN]
+|  hash predicates: t1.id = t2.id
+|  other predicates: CASE WHEN t1.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+|  runtime filters: RF000 <- t2.id
+|  row-size=39B cardinality=3
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+|     HDFS partitions=1/1 files=1 size=541B
+|     predicates: CASE WHEN t2.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+|     row-size=26B cardinality=3
+|
+00:SCAN HDFS [functional.nullrows t1]
+   HDFS partitions=1/1 files=1 size=541B
+   runtime filters: RF000 -> t1.id
+   row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate 'case when t2.id is not null and t2.some_nulls is null
+# then true else null end is not null' is null-filtering for t2.
+SELECT COUNT(*) FROM
+(
+  SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+         THEN true ELSE NULL END AS a
+  FROM
+  (
+    SELECT t2.id
+           ,t2.some_nulls
+      FROM functional.nullrows t1
+      LEFT JOIN functional.nullrows t2
+      ON t1.id = t2.id
+   ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  row-size=8B cardinality=1
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.id = t2.id
+|  other predicates: CASE WHEN t2.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+|  row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+|     HDFS partitions=1/1 files=1 size=541B
+|     predicates: CASE WHEN t2.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+|     row-size=26B cardinality=3
+|
+00:SCAN HDFS [functional.nullrows t1]
+   HDFS partitions=1/1 files=1 size=541B
+   row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate 't1.test_id + t2.zip = 2' is null-filtering for t2.
+SELECT *
+FROM (SELECT test_id + zip a FROM
+    (SELECT STRAIGHT_JOIN t1.test_id, t2.zip FROM functional.jointbl t1
+    LEFT JOIN functional.testtbl t2 ON t1.test_id = t2.id
+   ) t
+) t
+WHERE a = 2
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.test_id = t2.id
+|  other predicates: t1.test_id + t2.zip = 2
+|  row-size=20B cardinality=19
+|
+|--01:SCAN HDFS [functional.testtbl t2]
+|     HDFS partitions=1/1 files=0 size=0B
+|     predicates: t2.id + t2.zip = 2
+|     row-size=12B cardinality=0
+|
+00:SCAN HDFS [functional.jointbl t1]
+   HDFS partitions=1/1 files=1 size=433B
+   row-size=8B cardinality=19
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate '(t1.id is not null and t2.some_nulls is null) = 1' is
+# not null-filtering for t2.
+SELECT *
+FROM (SELECT id IS NOT NULL AND some_nulls IS NULL AS a
+    FROM (SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+      FROM functional.nullrows t1
+      LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+   ) t
+) t
+WHERE a = 1
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.id = t2.id
+|  other predicates: t1.id IS NOT NULL AND t2.some_nulls IS NULL = 1
+|  row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+|     HDFS partitions=1/1 files=1 size=541B
+|     row-size=26B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+   HDFS partitions=1/1 files=1 size=541B
+   row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# Same as above but with right join
+SELECT *
+FROM (SELECT id IS NOT NULL AND some_nulls IS NULL AS a
+    FROM (SELECT STRAIGHT_JOIN t2.id, t1.some_nulls
+      FROM functional.nullrows t1
+      RIGHT JOIN functional.nullrows t2 ON t1.id = t2.id
+   ) t
+) t
+WHERE a = 1
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [RIGHT OUTER JOIN]
+|  hash predicates: t1.id = t2.id
+|  other predicates: t2.id IS NOT NULL AND t1.some_nulls IS NULL = 1
+|  runtime filters: RF000 <- t2.id
+|  row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+|     HDFS partitions=1/1 files=1 size=541B
+|     row-size=13B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+   HDFS partitions=1/1 files=1 size=541B
+   runtime filters: RF000 -> t1.id
+   row-size=26B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate '(t1.id is not null and t2.some_nulls is not null) = 1'
+# is null-filtering for t2.
+SELECT *
+FROM (SELECT id IS NOT NULL AND some_nulls IS NOT NULL AS a
+   FROM (SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+      FROM functional.nullrows t1
+        LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+   ) t
+) t
+WHERE a = 1
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: t1.id = t2.id
+|  other predicates: t1.id IS NOT NULL AND t2.some_nulls IS NOT NULL = 1
+|  row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+|     HDFS partitions=1/1 files=1 size=541B
+|     predicates: t2.id IS NOT NULL AND t2.some_nulls IS NOT NULL = 1
+|     row-size=26B cardinality=9
+|
+00:SCAN HDFS [functional.nullrows t1]
+   HDFS partitions=1/1 files=1 size=541B
+   row-size=13B cardinality=26
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test b/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test
index edb1e8449..6761e88c1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test
@@ -728,3 +728,63 @@ LEFT OUTER JOIN (
 ---- TYPES
 bigint
 ====
+---- QUERY
+# Test inferred predicate propagation
+SELECT count(*)
+FROM (
+  SELECT CASE
+      WHEN id IS NOT NULL
+      AND some_nulls IS NULL THEN true
+      ELSE NULL
+    END AS a
+  FROM (
+    SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+    FROM functional.nullrows t1
+      LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+  ) t
+) t
+WHERE a IS NOT NULL
+---- RESULTS
+20
+---- TYPES
+bigint
+====
+---- QUERY
+# Test inferred predicate propagation
+SELECT count(*)
+FROM (
+  SELECT CASE
+      WHEN id IS NOT NULL
+      AND some_nulls IS NULL THEN true
+      ELSE NULL
+    END AS a
+  FROM (
+    SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+    FROM functional.nullrows t2
+      RIGHT JOIN functional.nullrows t1 ON t1.id = t2.id
+  ) t
+) t
+WHERE a IS NOT NULL
+---- RESULTS
+20
+---- TYPES
+bigint
+====
+---- QUERY
+# Test inferred predicate propagation
+SELECT count(*)
+FROM (
+  SELECT id IS NOT NULL
+      AND some_nulls IS NOT NULL AS a
+  FROM (
+    SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+    FROM functional.nullrows t1
+      LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+ ) t
+) t
+WHERE a = 1
+---- RESULTS
+6
+---- TYPES
+bigint
+====
\ No newline at end of file