You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2020/08/25 23:42:03 UTC

[impala] branch master updated: IMPALA-7782: fix constant NOT IN subqueries that can return 0 rows

This is an automated email from the ASF dual-hosted git repository.

stakiar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new e133d18  IMPALA-7782: fix constant NOT IN subqueries that can return 0 rows
e133d18 is described below

commit e133d1838ab05e75007fef24e2ce1b6f18113c8d
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Aug 13 12:53:37 2020 -0700

    IMPALA-7782: fix constant NOT IN subqueries that can return 0 rows
    
    The bug was the the statement rewriter converted NOT IN <subquery>
    predicates to != <subquery> predicates when the subquery could
    be an empty set. This was invalid, because NOT IN (<empty set>)
    is true, but != (<empty set>) is false.
    
    Testing:
    Added targeted planner and end-to-end tests.
    
    Ran exhaustive tests.
    
    Change-Id: I66c726f0f66ce2f609e6ba44057191f5929a67fc
    Reviewed-on: http://gerrit.cloudera.org:8080/16338
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../org/apache/impala/analysis/StmtRewriter.java   |  4 +-
 .../queries/PlannerTest/subquery-rewrite.test      | 52 +++++++++++++++++-
 .../QueryTest/subquery-in-constant-lhs.test        | 64 +++++++++++++++++++++-
 3 files changed, 115 insertions(+), 5 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java
index 19d9dc1..705eb07 100644
--- a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java
+++ b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java
@@ -480,7 +480,9 @@ public class StmtRewriter {
       if (!inPred.isNotIn()) return null;
 
       // CASE 2, NOT IN and RHS returns a single row:
-      if (rhsQuery.returnsAtMostOneRow()) {
+      // IMPALA-7782: this rewrite is only valid if the subquery is always non-empty
+      // because C NOT IN (<empty set>) is true, but C != (<empty set>) is false.
+      if (rhsQuery.returnsExactlyOneRow()) {
         return new BinaryPredicate(BinaryPredicate.Operator.NE, lhs, rhs);
       }
 
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
index 25d50f6..6eba293 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test
@@ -2692,11 +2692,12 @@ select * from functional.alltypessmall where
 ---- PLAN
 PLAN-ROOT SINK
 |
-03:NESTED LOOP JOIN [CROSS JOIN]
-|  row-size=93B cardinality=100
+03:NESTED LOOP JOIN [LEFT ANTI JOIN]
+|  row-size=89B cardinality=100
 |
 |--02:SELECT
-|  |  predicates: 1 != int_col
+|  |  predicates: 1 IS NULL OR int_col IS NULL OR int_col = 1
+|  |  limit: 1
 |  |  row-size=4B cardinality=1
 |  |
 |  01:SCAN HDFS [functional.alltypestiny]
@@ -4763,3 +4764,48 @@ PLAN-ROOT SINK
    HDFS partitions=11/11 files=11 size=814.73KB
    row-size=25B cardinality=11.00K
 ====
+# IMPALA-7782: incorrect results for subquery with HAVING predicate.
+# Subqueries with HAVING clauses need to be rewritten to an outer join
+# because they may return an empty result set.
+SELECT id
+FROM functional.alltypestiny
+WHERE -1 NOT IN (SELECT COUNT(id) FROM functional.alltypestiny HAVING false)
+---- PLAN
+PLAN-ROOT SINK
+|
+02:NESTED LOOP JOIN [LEFT ANTI JOIN]
+|  row-size=4B cardinality=8
+|
+|--01:EMPTYSET
+|     limit: 1
+|
+00:SCAN HDFS [functional.alltypestiny]
+   HDFS partitions=4/4 files=4 size=460B
+   row-size=4B cardinality=8
+====
+# IMPALA-7782: incorrect results for subquery with HAVING predicate.
+# Subqueries with HAVING clauses need to be rewritten to an outer join
+# because they may return an empty result set.
+SELECT id
+FROM functional.alltypestiny
+WHERE -1 NOT IN (SELECT COUNT(id) FROM functional.alltypestiny HAVING count(id) > 100)
+---- PLAN
+PLAN-ROOT SINK
+|
+03:NESTED LOOP JOIN [LEFT ANTI JOIN]
+|  row-size=4B cardinality=8
+|
+|--02:AGGREGATE [FINALIZE]
+|  |  output: count(id)
+|  |  having: count(id) > 100, -1 IS NULL OR count(id) IS NULL OR count(id) = -1
+|  |  limit: 1
+|  |  row-size=8B cardinality=1
+|  |
+|  01:SCAN HDFS [functional.alltypestiny]
+|     HDFS partitions=4/4 files=4 size=460B
+|     row-size=4B cardinality=8
+|
+00:SCAN HDFS [functional.alltypestiny]
+   HDFS partitions=4/4 files=4 size=460B
+   row-size=4B cardinality=8
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test b/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test
index 56627cb..dec0155 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/subquery-in-constant-lhs.test
@@ -298,4 +298,66 @@ SELECT a.id FROM alltypessmall a WHERE
 ---- RESULTS
 ---- TYPES
 INT
-====
\ No newline at end of file
+====
+---- QUERY
+# IMPALA-7782: incorrect results for subquery with HAVING predicate.
+# The subquery is empty so the predicate is always true
+SELECT id
+FROM alltypestiny
+WHERE -1 NOT IN (SELECT COUNT(id) FROM alltypestiny HAVING false)
+---- RESULTS
+0
+1
+2
+3
+4
+5
+6
+7
+---- TYPES
+INT
+====
+---- QUERY
+# IMPALA-7782: incorrect results for subquery with HAVING predicate.
+# The subquery is empty so the predicate is always true
+SELECT id
+FROM alltypestiny
+WHERE NULL NOT IN (SELECT COUNT(id) FROM alltypestiny HAVING false)
+---- RESULTS
+0
+1
+2
+3
+4
+5
+6
+7
+---- TYPES
+INT
+====
+---- QUERY
+# IMPALA-7782: true having predicate is also handled correctly.
+SELECT id
+FROM alltypestiny
+WHERE 0 NOT IN (SELECT COUNT(id) FROM alltypestiny HAVING count(id) > 1)
+---- RESULTS
+0
+1
+2
+3
+4
+5
+6
+7
+---- TYPES
+INT
+====
+---- QUERY
+# IMPALA-7782: true having predicate is also handled correctly.
+SELECT id
+FROM alltypestiny
+WHERE 8 NOT IN (SELECT COUNT(id) FROM alltypestiny HAVING count(id) > 1)
+---- RESULTS
+---- TYPES
+INT
+====