You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/08/08 23:37:01 UTC

[impala] 04/27: IMPALA-11301: Fix extreme != selectivity for NDV=1

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit e5164c89e57de817dede5beca7100fd8fea97565
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed May 18 17:58:08 2022 +0200

    IMPALA-11301: Fix extreme != selectivity for NDV=1
    
    The original selectivity of 1.0 - 1.0/ndv makes sense for
    large NDVs, but the result is 0.0 in case of ndv==1, which
    leads to a cardinality of 1 even for huge tables. The new
    selectivity for ndv==1 is 0.5; other NDVs keep the old formula.
    
    Note that as the formula for = is not changed (1.0/ndv),
    NOT col="const" will still lead to 0.0 selectivity if ndv=1.
    Changing the formula of NOT or = would have caused a lot of
    subtle changes to the plans in tests, so I don't want to touch
    those before reaching wider agreement about the correct
    approach.
    
    IMPALA-7601 contains some discussion about these formulas.
    
    Testing:
    - added a regression test
    
    Change-Id: I6b5334a8d7d6ca46a450ff98ae03e5269faaa3c6
    Reviewed-on: http://gerrit.cloudera.org:8080/18543
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../main/java/org/apache/impala/analysis/BinaryPredicate.java  |  3 ++-
 .../test/java/org/apache/impala/planner/CardinalityTest.java   | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java b/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java
index 72b98c3a3..22c8730e5 100644
--- a/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java
+++ b/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java
@@ -270,7 +270,8 @@ public class BinaryPredicate extends Predicate {
       if (op_ == Operator.DISTINCT_FROM && rChildIsNull) {
         selectivity_ = 1.0;
       } else {
-        selectivity_ = 1.0 - 1.0 / distinctValues;
+        // avoid 0.0 selectivity if ndv == 1 (IMPALA-11301).
+        selectivity_ = distinctValues == 1 ? 0.5 : (1.0 - 1.0 / distinctValues);
       }
     } else {
       return;
diff --git a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
index 067d8385f..317a36151 100644
--- a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
@@ -101,6 +101,10 @@ public class CardinalityTest extends PlannerTestBase {
         "SELECT COUNT(*) FROM functional.alltypes GROUP BY id", 7300);
     verifyCardinality(
         "SELECT COUNT(*) FROM functional.alltypes GROUP BY bool_col", 2);
+
+    // Regression test for IMPALA-11301.
+    verifyCardinality(
+        "SELECT * FROM tpcds_parquet.date_dim WHERE d_current_day != 'a'", 36525);
   }
 
   /**
@@ -396,6 +400,12 @@ public class CardinalityTest extends PlannerTestBase {
     verifyApproxCardinality("SELECT SUM(int_col) OVER() int_col "
         + "FROM functional_parquet.alltypestiny", 742, true,
         ImmutableSet.of(), path, AnalyticEvalNode.class);
+
+    // Regression test for IMPALA-11301. row_number() is (incorrectly) assumed to have
+    // NDV=1, which was leading to selectivity=0.0 in rn != 5. Will break if someone
+    // implements correct ndv estimates for analytic functions.
+    verifyCardinality("SELECT * FROM (SELECT *, row_number() OVER(order by id) "
+        + "as rn FROM functional.alltypestiny) v where rn != 5", 4);
   }
 
   @Test