You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/04/05 07:43:13 UTC
[impala] 01/03: IMPALA-11008: fix incorrect to propagate inferred predicates
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit abfa5a72b6a6046037f27ad02d19b31f7b31c4e5
Author: xqhe <he...@126.com>
AuthorDate: Tue Feb 15 12:24:10 2022 +0800
IMPALA-11008: fix incorrect to propagate inferred predicates
It is incorrect to propagate predicates inferred from equi-join
conjuncts into a plan subtree that is on the nullable side of an
outer join if the predicate is not null-filtering for the nullable
side.
For example:
SELECT *
FROM (
SELECT id IS NOT NULL
AND col IS NULL AS a
FROM (
SELECT A.id, B.col
FROM A
LEFT JOIN B ON A.id = B.id
) t
) t
WHERE a = 1
Before this patch the inferred predicate '(B.id is not null and
B.col is null) = 1' is evaluated at the scanner of B. This is
incorrect since the predicate '(A.id is not null and B.col is null)
= 1' is not null-filtering for B.
To generate the inferred predicate we substitue the non-outer-join
slots first and use 'isNullableConjunct' to do a more strict check
on the conjunct before the final substitution.
Tests:
- Add plan tests in predicate-propagation.test
- Add new query tests to verify the correctness of inferred
predicates propagation
- Ran the full set of verifications in Impala Public Jenkins
Change-Id: I9e64230f6d0c2b9ef1560186ceba349a5920ccdf
Reviewed-on: http://gerrit.cloudera.org:8080/18234
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../java/org/apache/impala/analysis/Analyzer.java | 35 ++-
.../queries/PlannerTest/predicate-propagation.test | 262 +++++++++++++++++++++
.../queries/QueryTest/outer-joins.test | 60 +++++
3 files changed, 356 insertions(+), 1 deletion(-)
diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index d8d412694..5cdadbff1 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -517,6 +517,11 @@ public class Analyzer {
// ref.
public Set<TupleId> zippingUnnestTupleIds = new HashSet<>();
+ // This holds the nullable side slot ids from the outer join's equi-join conjuncts
+ // e.g. t1 left join t2 on t1.id = t2.id, the slot id of t2.id will be added to
+ // this set.
+ public Set<SlotId> ojNullableSlotsInEquiPreds = new HashSet<>();
+
public GlobalState(StmtTableCache stmtTableCache, TQueryCtx queryCtx,
AuthorizationFactory authzFactory, AuthorizationContext authzCtx) {
this.stmtTableCache = stmtTableCache;
@@ -2131,14 +2136,38 @@ public class Analyzer {
if (srcSids.containsAll(destSids)) {
p = srcConjunct;
} else {
+ // It is incorrect to propagate predicates inferred from equi-join conjuncts
+ // into a plan subtree that is on the nullable side of an outer join if the
+ // predicate is not null-filtering.
+ // For example:
+ // select * from (select id is not null and col is null as a from (select A.id,
+ // B.col from A left join B on A.id = B.id) t ) t where a = 1
+ // In this query the inferred predicate (B.id is not null and B.col is null = 1)
+ // should not be evaluated at the scanner of B.
+ // We use 'ojmap' to save the mapping from src to the outer join equal slot,
+ // eg. B.id. We substitue the non-outer-join slots first and use
+ // 'isNullableConjunct' to do a more strict check on the conjunct before the
+ // final substitution.
ExprSubstitutionMap smap = new ExprSubstitutionMap();
+ ExprSubstitutionMap ojSmap = new ExprSubstitutionMap();
for (int i = 0; i < srcSids.size(); ++i) {
- smap.put(
+ if (globalState_.ojNullableSlotsInEquiPreds.contains(destSids.get(i))) {
+ ojSmap.put(
new SlotRef(globalState_.descTbl.getSlotDesc(srcSids.get(i))),
new SlotRef(globalState_.descTbl.getSlotDesc(destSids.get(i))));
+ } else {
+ smap.put(
+ new SlotRef(globalState_.descTbl.getSlotDesc(srcSids.get(i))),
+ new SlotRef(globalState_.descTbl.getSlotDesc(destSids.get(i))));
+ }
}
try {
p = srcConjunct.trySubstitute(smap, this, false);
+ if (hasOuterJoinedTuple &&
+ isNullableConjunct(p, Arrays.asList(destTid))) continue;
+ if (ojSmap.size() != 0) {
+ p = p.trySubstitute(ojSmap, this, false);
+ }
} catch (ImpalaException exc) {
// not an executable predicate; ignore
continue;
@@ -2715,6 +2744,10 @@ public class Analyzer {
|| tblRef.getJoinOp() == JoinOperator.RIGHT_ANTI_JOIN) {
g.addEdge(innerSlot.asInt(), outerSlot.asInt());
}
+ if (tblRef.getJoinOp() == JoinOperator.LEFT_OUTER_JOIN ||
+ tblRef.getJoinOp() == JoinOperator.RIGHT_OUTER_JOIN) {
+ globalState_.ojNullableSlotsInEquiPreds.add(innerSlot);
+ }
}
}
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test b/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
index 74ff2b4d6..b136d965d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/predicate-propagation.test
@@ -1590,3 +1590,265 @@ PLAN-ROOT SINK
predicates: c_custkey % 2 = 0
row-size=8B cardinality=15.00K
====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate 'case when t1.id is not null and t2.some_nulls is null
+# then true else null end is not null' is not null-filtering for t2, so
+# the inferred predicate is not pushable for t2.
+SELECT COUNT(*) FROM
+(
+ SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+ THEN true ELSE NULL END AS a
+ FROM
+ (
+ SELECT STRAIGHT_JOIN t1.id
+ ,t2.some_nulls
+ FROM functional.nullrows t1
+ LEFT JOIN functional.nullrows t2
+ ON t1.id = t2.id
+ ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+| output: count(*)
+| row-size=8B cardinality=1
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+| hash predicates: t1.id = t2.id
+| other predicates: CASE WHEN t1.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+| row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+| HDFS partitions=1/1 files=1 size=541B
+| row-size=26B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+ HDFS partitions=1/1 files=1 size=541B
+ row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# Same as above but with right join
+SELECT COUNT(*) FROM
+(
+ SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+ THEN true ELSE NULL END AS a
+ FROM
+ (
+ SELECT STRAIGHT_JOIN t2.id
+ ,t1.some_nulls
+ FROM functional.nullrows t1
+ RIGHT JOIN functional.nullrows t2
+ ON t1.id = t2.id
+ ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+| output: count(*)
+| row-size=8B cardinality=1
+|
+02:HASH JOIN [RIGHT OUTER JOIN]
+| hash predicates: t1.id = t2.id
+| other predicates: CASE WHEN t2.id IS NOT NULL AND t1.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+| runtime filters: RF000 <- t2.id
+| row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+| HDFS partitions=1/1 files=1 size=541B
+| row-size=13B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+ HDFS partitions=1/1 files=1 size=541B
+ runtime filters: RF000 -> t1.id
+ row-size=26B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# It's correct to propagate inferred predicates into an inner join
+SELECT COUNT(*) FROM
+(
+ SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+ THEN true ELSE NULL END AS a
+ FROM
+ (
+ SELECT t1.id
+ ,t2.some_nulls
+ FROM functional.nullrows t1
+ JOIN functional.nullrows t2
+ ON t1.id = t2.id
+ ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+| output: count(*)
+| row-size=8B cardinality=1
+|
+02:HASH JOIN [INNER JOIN]
+| hash predicates: t1.id = t2.id
+| other predicates: CASE WHEN t1.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+| runtime filters: RF000 <- t2.id
+| row-size=39B cardinality=3
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+| HDFS partitions=1/1 files=1 size=541B
+| predicates: CASE WHEN t2.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+| row-size=26B cardinality=3
+|
+00:SCAN HDFS [functional.nullrows t1]
+ HDFS partitions=1/1 files=1 size=541B
+ runtime filters: RF000 -> t1.id
+ row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate 'case when t2.id is not null and t2.some_nulls is null
+# then true else null end is not null' is null-filtering for t2.
+SELECT COUNT(*) FROM
+(
+ SELECT CASE WHEN id IS NOT NULL AND some_nulls IS NULL
+ THEN true ELSE NULL END AS a
+ FROM
+ (
+ SELECT t2.id
+ ,t2.some_nulls
+ FROM functional.nullrows t1
+ LEFT JOIN functional.nullrows t2
+ ON t1.id = t2.id
+ ) t
+) t
+WHERE a IS NOT NULL
+---- PLAN
+PLAN-ROOT SINK
+|
+03:AGGREGATE [FINALIZE]
+| output: count(*)
+| row-size=8B cardinality=1
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+| hash predicates: t1.id = t2.id
+| other predicates: CASE WHEN t2.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+| row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+| HDFS partitions=1/1 files=1 size=541B
+| predicates: CASE WHEN t2.id IS NOT NULL AND t2.some_nulls IS NULL THEN TRUE ELSE NULL END IS NOT NULL
+| row-size=26B cardinality=3
+|
+00:SCAN HDFS [functional.nullrows t1]
+ HDFS partitions=1/1 files=1 size=541B
+ row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate 't1.test_id + t2.zip = 2' is null-filtering for t2.
+SELECT *
+FROM (SELECT test_id + zip a FROM
+ (SELECT STRAIGHT_JOIN t1.test_id, t2.zip FROM functional.jointbl t1
+ LEFT JOIN functional.testtbl t2 ON t1.test_id = t2.id
+ ) t
+) t
+WHERE a = 2
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+| hash predicates: t1.test_id = t2.id
+| other predicates: t1.test_id + t2.zip = 2
+| row-size=20B cardinality=19
+|
+|--01:SCAN HDFS [functional.testtbl t2]
+| HDFS partitions=1/1 files=0 size=0B
+| predicates: t2.id + t2.zip = 2
+| row-size=12B cardinality=0
+|
+00:SCAN HDFS [functional.jointbl t1]
+ HDFS partitions=1/1 files=1 size=433B
+ row-size=8B cardinality=19
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate '(t1.id is not null and t2.some_nulls is null) = 1' is
+# not null-filtering for t2.
+SELECT *
+FROM (SELECT id IS NOT NULL AND some_nulls IS NULL AS a
+ FROM (SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+ FROM functional.nullrows t1
+ LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+ ) t
+) t
+WHERE a = 1
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+| hash predicates: t1.id = t2.id
+| other predicates: t1.id IS NOT NULL AND t2.some_nulls IS NULL = 1
+| row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+| HDFS partitions=1/1 files=1 size=541B
+| row-size=26B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+ HDFS partitions=1/1 files=1 size=541B
+ row-size=13B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# Same as above but with right join
+SELECT *
+FROM (SELECT id IS NOT NULL AND some_nulls IS NULL AS a
+ FROM (SELECT STRAIGHT_JOIN t2.id, t1.some_nulls
+ FROM functional.nullrows t1
+ RIGHT JOIN functional.nullrows t2 ON t1.id = t2.id
+ ) t
+) t
+WHERE a = 1
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [RIGHT OUTER JOIN]
+| hash predicates: t1.id = t2.id
+| other predicates: t2.id IS NOT NULL AND t1.some_nulls IS NULL = 1
+| runtime filters: RF000 <- t2.id
+| row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+| HDFS partitions=1/1 files=1 size=541B
+| row-size=13B cardinality=26
+|
+00:SCAN HDFS [functional.nullrows t1]
+ HDFS partitions=1/1 files=1 size=541B
+ runtime filters: RF000 -> t1.id
+ row-size=26B cardinality=26
+====
+# IMPALA-11008: fix incorrect to propagate inferred predicates
+# The predicate '(t1.id is not null and t2.some_nulls is not null) = 1'
+# is null-filtering for t2.
+SELECT *
+FROM (SELECT id IS NOT NULL AND some_nulls IS NOT NULL AS a
+ FROM (SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+ FROM functional.nullrows t1
+ LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+ ) t
+) t
+WHERE a = 1
+---- PLAN
+PLAN-ROOT SINK
+|
+02:HASH JOIN [LEFT OUTER JOIN]
+| hash predicates: t1.id = t2.id
+| other predicates: t1.id IS NOT NULL AND t2.some_nulls IS NOT NULL = 1
+| row-size=39B cardinality=26
+|
+|--01:SCAN HDFS [functional.nullrows t2]
+| HDFS partitions=1/1 files=1 size=541B
+| predicates: t2.id IS NOT NULL AND t2.some_nulls IS NOT NULL = 1
+| row-size=26B cardinality=9
+|
+00:SCAN HDFS [functional.nullrows t1]
+ HDFS partitions=1/1 files=1 size=541B
+ row-size=13B cardinality=26
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test b/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test
index edb1e8449..6761e88c1 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/outer-joins.test
@@ -728,3 +728,63 @@ LEFT OUTER JOIN (
---- TYPES
bigint
====
+---- QUERY
+# Test inferred predicate propagation
+SELECT count(*)
+FROM (
+ SELECT CASE
+ WHEN id IS NOT NULL
+ AND some_nulls IS NULL THEN true
+ ELSE NULL
+ END AS a
+ FROM (
+ SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+ FROM functional.nullrows t1
+ LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+ ) t
+) t
+WHERE a IS NOT NULL
+---- RESULTS
+20
+---- TYPES
+bigint
+====
+---- QUERY
+# Test inferred predicate propagation
+SELECT count(*)
+FROM (
+ SELECT CASE
+ WHEN id IS NOT NULL
+ AND some_nulls IS NULL THEN true
+ ELSE NULL
+ END AS a
+ FROM (
+ SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+ FROM functional.nullrows t2
+ RIGHT JOIN functional.nullrows t1 ON t1.id = t2.id
+ ) t
+) t
+WHERE a IS NOT NULL
+---- RESULTS
+20
+---- TYPES
+bigint
+====
+---- QUERY
+# Test inferred predicate propagation
+SELECT count(*)
+FROM (
+ SELECT id IS NOT NULL
+ AND some_nulls IS NOT NULL AS a
+ FROM (
+ SELECT STRAIGHT_JOIN t1.id, t2.some_nulls
+ FROM functional.nullrows t1
+ LEFT JOIN functional.nullrows t2 ON t1.id = t2.id
+ ) t
+) t
+WHERE a = 1
+---- RESULTS
+6
+---- TYPES
+bigint
+====
\ No newline at end of file