You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/08/08 23:37:01 UTC
[impala] 04/27: IMPALA-11301: Fix extreme != selectivity for NDV=1
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git
commit e5164c89e57de817dede5beca7100fd8fea97565
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Wed May 18 17:58:08 2022 +0200
IMPALA-11301: Fix extreme != selectivity for NDV=1
The original selectivity of 1.0 - 1.0/ndv makes sense for
large NDVs, but the result is 0.0 in case of ndv==1, which
leads to a cardinality of 1 even for huge tables. The new
selectivity is 0.5.
Note that because the formula for = is unchanged (1.0/ndv),
NOT (col = "const") will still lead to 0.0 selectivity if ndv=1.
Changing the formula for NOT or = would cause a lot of
subtle plan changes in tests, so I did not touch those
formulas before reaching wider agreement about the correct
approach.
IMPALA-7601 contains some discussion about these formulas.
Testing:
- added a regression test
Change-Id: I6b5334a8d7d6ca46a450ff98ae03e5269faaa3c6
Reviewed-on: http://gerrit.cloudera.org:8080/18543
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../main/java/org/apache/impala/analysis/BinaryPredicate.java | 3 ++-
.../test/java/org/apache/impala/planner/CardinalityTest.java | 10 ++++++++++
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java b/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java
index 72b98c3a3..22c8730e5 100644
--- a/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java
+++ b/fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java
@@ -270,7 +270,8 @@ public class BinaryPredicate extends Predicate {
if (op_ == Operator.DISTINCT_FROM && rChildIsNull) {
selectivity_ = 1.0;
} else {
- selectivity_ = 1.0 - 1.0 / distinctValues;
+ // avoid 0.0 selectivity if ndv == 1 (IMPALA-11301).
+ selectivity_ = distinctValues == 1 ? 0.5 : (1.0 - 1.0 / distinctValues);
}
} else {
return;
diff --git a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
index 067d8385f..317a36151 100644
--- a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
@@ -101,6 +101,10 @@ public class CardinalityTest extends PlannerTestBase {
"SELECT COUNT(*) FROM functional.alltypes GROUP BY id", 7300);
verifyCardinality(
"SELECT COUNT(*) FROM functional.alltypes GROUP BY bool_col", 2);
+
+ // Regression test for IMPALA-11301.
+ verifyCardinality(
+ "SELECT * FROM tpcds_parquet.date_dim WHERE d_current_day != 'a'", 36525);
}
/**
@@ -396,6 +400,12 @@ public class CardinalityTest extends PlannerTestBase {
verifyApproxCardinality("SELECT SUM(int_col) OVER() int_col "
+ "FROM functional_parquet.alltypestiny", 742, true,
ImmutableSet.of(), path, AnalyticEvalNode.class);
+
+ // Regression test for IMPALA-11301. row_number() is (incorrectly) assumed to have
+ // NDV=1, which was leading to selectivity=0.0 in rn != 5. Will break if someone
+ // implements correct ndv estimates for analytic functions.
+ verifyCardinality("SELECT * FROM (SELECT *, row_number() OVER(order by id) "
+ + "as rn FROM functional.alltypestiny) v where rn != 5", 4);
}
@Test