You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2022/06/14 21:00:29 UTC

[impala] branch master updated: IMPALA-11280: Join node incorrectly picks up unnest(array) predicates

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 2744f46fb IMPALA-11280: Join node incorrectly picks up unnest(array) predicates
2744f46fb is described below

commit 2744f46fbd921dafe9b63f4a0011b2237ee07c5f
Author: Gabor Kaszab <ga...@cloudera.com>
AuthorDate: Mon Jun 13 14:04:36 2022 +0200

    IMPALA-11280: Join node incorrectly picks up unnest(array) predicates
    
    Predicates on unnested arrays are expected to be picked up for
    evaluation by either the SCAN node or the UNNEST node: the SCAN node
    is responsible if only one array is being unnested, and the UNNEST
    node is responsible otherwise. However, if a JOIN node is involved
    and the JOIN is constructed before the UNNEST node is created, then
    the JOIN node incorrectly picks up the predicates on the unnested
    arrays as well. This patch fixes this behaviour.
    
    Tests:
      - Added E2E tests to cover result correctness.
      - Added planner tests to verify that the desired node picks up the
        predicates for unnested arrays.
    
    Change-Id: I89fed4eef220ca513b259f0e2649cdfbe43c797a
    Reviewed-on: http://gerrit.cloudera.org:8080/18614
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../java/org/apache/impala/planner/PlanNode.java   |  2 +-
 .../java/org/apache/impala/planner/ScanNode.java   |  3 +
 .../apache/impala/planner/SingleNodePlanner.java   |  1 +
 .../apache/impala/planner/SingularRowSrcNode.java  |  3 -
 .../java/org/apache/impala/planner/UnnestNode.java |  3 +
 .../queries/PlannerTest/zipping-unnest.test        | 80 ++++++++++++++++++++++
 .../QueryTest/nested-array-in-select-list.test     | 45 +++++++++++-
 7 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
index 2dc21afe2..b936d5a5c 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
@@ -548,7 +548,7 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
     analyzer.markConjunctsAssigned(unassigned);
   }
 
-  protected boolean shouldPickUpZippingUnnestConjuncts() { return true; }
+  protected boolean shouldPickUpZippingUnnestConjuncts() { return false; }
 
   /**
    * Apply the provided conjuncts to the this node, returning the new root of
diff --git a/fe/src/main/java/org/apache/impala/planner/ScanNode.java b/fe/src/main/java/org/apache/impala/planner/ScanNode.java
index e8731d6e8..30f7fee62 100644
--- a/fe/src/main/java/org/apache/impala/planner/ScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/ScanNode.java
@@ -85,6 +85,9 @@ abstract public class ScanNode extends PlanNode {
 
   public TupleDescriptor getTupleDesc() { return desc_; }
 
+  @Override
+  protected boolean shouldPickUpZippingUnnestConjuncts() { return true; }
+
   /**
    * Checks if this scan is supported based on the types of scanned columns and the
    * underlying file formats, in particular, whether complex types are supported.
diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
index 734885fbe..143ff4bce 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
@@ -2054,6 +2054,7 @@ public class SingleNodePlanner {
         Preconditions.checkState(hasNullMatchingEqOperator);
       }
     }
+    PlanNode.removeZippingUnnestConjuncts(otherJoinConjuncts, analyzer);
     analyzer.markConjunctsAssigned(otherJoinConjuncts);
 
     if (analyzer.getQueryOptions().isEnable_distinct_semi_join_optimization() &&
diff --git a/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java b/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
index bef43b783..e9c498417 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingularRowSrcNode.java
@@ -88,7 +88,4 @@ public class SingularRowSrcNode extends PlanNode {
   protected void toThrift(TPlanNode msg) {
     msg.node_type = TPlanNodeType.SINGULAR_ROW_SRC_NODE;
   }
-
-  @Override
-  protected boolean shouldPickUpZippingUnnestConjuncts() { return false; }
 }
diff --git a/fe/src/main/java/org/apache/impala/planner/UnnestNode.java b/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
index e5247af0f..dd6eb33f5 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnnestNode.java
@@ -82,6 +82,9 @@ public class UnnestNode extends PlanNode {
     computeMemLayout(analyzer);
   }
 
+  @Override
+  protected boolean shouldPickUpZippingUnnestConjuncts() { return true; }
+
   @Override
   public void computeStats(Analyzer analyzer) {
     super.computeStats(analyzer);
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/zipping-unnest.test b/testdata/workloads/functional-planner/queries/PlannerTest/zipping-unnest.test
index a9672637d..f4a39cab6 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/zipping-unnest.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/zipping-unnest.test
@@ -204,3 +204,83 @@ PLAN-ROOT SINK
    HDFS partitions=1/1 files=1 size=1.06KB
    row-size=16B cardinality=1.35K
 ====
+# IMPALA-11280. There is a join involved here by using the IN operator, and multiple
+# arrays are unnested. Checks that the predicate on an unnested array is not picked up by
+# the join node.
+select id, unnested_arr1, unnested_arr2
+from (
+    select id, unnest(arr1) as unnested_arr1, unnest(arr2) as unnested_arr2
+    from functional_parquet.complextypes_arrays
+    where id % 2 = 1 and id in (select id from functional_parquet.alltypestiny)
+) a
+where a.unnested_arr1 < 5;
+---- PLAN
+PLAN-ROOT SINK
+|
+06:SUBPLAN
+|  row-size=44B cardinality=1.35K
+|
+|--04:NESTED LOOP JOIN [CROSS JOIN]
+|  |  row-size=44B cardinality=10
+|  |
+|  |--02:SINGULAR ROW SRC
+|  |     row-size=28B cardinality=1
+|  |
+|  03:UNNEST [functional_parquet.complextypes_arrays.arr1 arr1, functional_parquet.complextypes_arrays.arr2 arr2]
+|     predicates: UNNEST(arr1) < 5
+|     row-size=0B cardinality=10
+|
+05:HASH JOIN [LEFT SEMI JOIN]
+|  hash predicates: id = id
+|  runtime filters: RF000 <- id
+|  row-size=28B cardinality=135
+|
+|--01:SCAN HDFS [functional_parquet.alltypestiny]
+|     HDFS partitions=4/4 files=4 size=11.92KB
+|     predicates: functional_parquet.alltypestiny.id % 2 = 1
+|     row-size=4B cardinality=76
+|
+00:SCAN HDFS [functional_parquet.complextypes_arrays]
+   HDFS partitions=1/1 files=1 size=1.06KB
+   predicates: id % 2 = 1
+   runtime filters: RF000 -> id
+   row-size=28B cardinality=135
+====
+# Similar as above but here the join is explicitly included in the query string and is not
+# a result of a query rewrite.
+select a.id, unnested_arr1, unnested_arr2
+from (
+    select cta.id, unnest(arr1) as unnested_arr1, unnest(arr2) as unnested_arr2
+    from functional_parquet.complextypes_arrays cta left join functional_parquet.alltypestiny ti on cta.id = ti.id
+    where cta.id % 2 = 1) a
+where a.unnested_arr1 < 5;
+---- PLAN
+PLAN-ROOT SINK
+|
+06:SUBPLAN
+|  row-size=44B cardinality=1.35K
+|
+|--04:NESTED LOOP JOIN [CROSS JOIN]
+|  |  row-size=44B cardinality=10
+|  |
+|  |--02:SINGULAR ROW SRC
+|  |     row-size=28B cardinality=1
+|  |
+|  03:UNNEST [cta.arr1 arr1, cta.arr2 arr2]
+|     predicates: UNNEST(arr1) < 5
+|     row-size=0B cardinality=10
+|
+05:HASH JOIN [LEFT OUTER JOIN]
+|  hash predicates: cta.id = ti.id
+|  row-size=32B cardinality=135
+|
+|--01:SCAN HDFS [functional_parquet.alltypestiny ti]
+|     HDFS partitions=4/4 files=4 size=11.92KB
+|     predicates: ti.id % 2 = 1
+|     row-size=4B cardinality=76
+|
+00:SCAN HDFS [functional_parquet.complextypes_arrays cta]
+   HDFS partitions=1/1 files=1 size=1.06KB
+   predicates: cta.id % 2 = 1
+   row-size=28B cardinality=135
+====
\ No newline at end of file
diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-array-in-select-list.test b/testdata/workloads/functional-query/queries/QueryTest/nested-array-in-select-list.test
index 13362f23e..1b2ef1e63 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/nested-array-in-select-list.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/nested-array-in-select-list.test
@@ -334,11 +334,52 @@ BIGINT,INT
 select item from complextypes_arrays_only_view.int_array
 ---- CATCH
 AnalysisException: Non-relative collections are currently not supported on collections from views.
-=====
+====
 ---- QUERY
 # IMPALA-11052: allow using collections returned from views as non-relative table refs
 with s as (select int_array a from complextypestbl t)
 select item from s.a
 ---- CATCH
 AnalysisException: Could not resolve table reference: 's.a
-=====
\ No newline at end of file
+====
+---- QUERY
+# IMPALA-11280. There is a join involved here by using the IN operator, and multiple
+# arrays are unnested. Checks that the predicate on an unnested array is evaluated
+# correctly.
+select id, unnested_arr1, unnested_arr2
+from (
+    select id, unnest(arr1) as unnested_arr1, unnest(arr2) as unnested_arr2
+    from complextypes_arrays
+    where id % 2 = 1 and id in (select id from alltypestiny)
+) a
+where a.unnested_arr1 < 5;
+---- RESULTS
+1,1,'one'
+1,2,'two'
+1,3,'three'
+1,4,'four'
+7,1,'NULL'
+7,2,'NULL'
+---- TYPES
+INT,INT,STRING
+====
+---- QUERY
+# Similar as above but here the join is explicitly included in the query string and is not
+# a result of a query rewrite.
+select a.id, unnested_arr1, unnested_arr2
+from (
+    select cta.id, unnest(arr1) as unnested_arr1, unnest(arr2) as unnested_arr2
+    from functional_parquet.complextypes_arrays cta
+        left join functional_parquet.alltypestiny ti on cta.id = ti.id
+    where cta.id % 2 = 1) a
+where a.unnested_arr1 < 5;
+---- RESULTS
+1,1,'one'
+1,2,'two'
+1,3,'three'
+1,4,'four'
+7,1,'NULL'
+7,2,'NULL'
+---- TYPES
+INT,INT,STRING
+====
\ No newline at end of file