You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2023/03/05 23:53:36 UTC

[impala] branch master updated (6ebf35cd5 -> 3573db68c)

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


    from 6ebf35cd5 IMPALA-11633: Use longer ctas timeout for S3/Ozone
     new e15610633 IMPALA-11965: Fix TestCodegenCache failure when codegen cache disabled by default
     new 3573db68c IMPALA-11960: Fix constant propagation from TIMESTAMP to DATE

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../impala/analysis/ConstantPredicateHandler.java  |  9 +++++
 .../queries/PlannerTest/constant-propagation.test  | 36 ++++++++++++++++-
 .../QueryTest/range-constant-propagation.test      | 21 ++++++++++
 tests/custom_cluster/test_codegen_cache.py         | 45 ++++++++++++++--------
 4 files changed, 94 insertions(+), 17 deletions(-)


[impala] 01/02: IMPALA-11965: Fix TestCodegenCache failure when codegen cache disabled by default

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit e15610633e5170559b6a0dd3cad10be1de0ebcd8
Author: Yida Wu <yi...@cloudera.com>
AuthorDate: Thu Mar 2 17:16:23 2023 -0800

    IMPALA-11965: Fix TestCodegenCache failure when codegen cache disabled by default
    
    The patch fixes the testcase TestCodegenCache failure when the codegen
    cache is changed to be disabled by default, because the testcase
    assumes the codegen cache is enabled with the default setting.
    
    The solution is to explicitly specify a value for codegen_cache_capacity
    in the testcase's start options, which ensures the codegen cache is
    on during the test.
    
    Tests:
    Passed TestCodegenCache in the exhaustive run with codegen cache
    disabled by default.
    
    Change-Id: I749a6ba68553834bdea908741aa7449ed32cd569
    Reviewed-on: http://gerrit.cloudera.org:8080/19574
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
    Reviewed-by: Michael Smith <mi...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/custom_cluster/test_codegen_cache.py | 45 ++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/tests/custom_cluster/test_codegen_cache.py b/tests/custom_cluster/test_codegen_cache.py
index f867fb378..bdd69c614 100644
--- a/tests/custom_cluster/test_codegen_cache.py
+++ b/tests/custom_cluster/test_codegen_cache.py
@@ -40,76 +40,88 @@ class TestCodegenCache(CustomClusterTestSuite):
     super(TestCodegenCache, cls).setup_class()
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache(self, vector):
     self._test_codegen_cache(vector,
             ("select * from (select * from functional.alltypes "
              + "limit 1000000) t1 where int_col > 10 limit 10"))
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_int_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where int_col > 0")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_tinyint_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where tinyint_col > 0")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_bool_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where bool_col > 0")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_bigint_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where bigint_col > 0")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_float_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where float_col > 0")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_double_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where double_col > 0")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_date_string_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where date_string_col != ''")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_string_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where string_col != ''")
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_poly_func_string_col(self, vector):
     self._test_codegen_cache(vector,
       ("select * from functional.alltypes where "
       + "CHAR_LENGTH(string_col) > 0"))
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_poly_func_date_string_col(self, vector):
     self._test_codegen_cache(vector,
       ("select * from functional.alltypes where "
       + "CHAR_LENGTH(date_string_col) > 0"))
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   # Test native uda is missed in the codegen cache, as it is disabled.
   def test_codegen_cache_uda_miss(self, vector):
     database = "test_codegen_cache_uda_miss"
@@ -118,7 +130,8 @@ class TestCodegenCache(CustomClusterTestSuite):
       "select test_count(int_col) from functional.alltypestiny", False)
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   # Test native udf is missed in the codegen cache, as it is disabled.
   def test_codegen_cache_udf_miss(self, vector):
     database = "test_codegen_cache_udf_miss"
@@ -193,7 +206,8 @@ class TestCodegenCache(CustomClusterTestSuite):
       assert result is not None
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=3)
+  @CustomClusterTestSuite.with_args(cluster_size=3,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_udf_crash(self, vector):
       # The testcase would crash if we don't disable the native udf for codegen cache.
       database = "test_codegen_cache_udf_crash"
@@ -234,7 +248,8 @@ class TestCodegenCache(CustomClusterTestSuite):
       assert result is not None
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(cluster_size=1)
+  @CustomClusterTestSuite.with_args(cluster_size=1,
+          impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_timezone_crash(self, vector):
       # The testcase tests whether it would crash using the broken builtin function
       # from_utc_timestamp from the codegen cache.


[impala] 02/02: IMPALA-11960: Fix constant propagation from TIMESTAMP to DATE

Posted by st...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3573db68c83918126337c9267140b7fa36f153e4
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Thu Mar 2 14:37:17 2023 +0100

    IMPALA-11960: Fix constant propagation from TIMESTAMP to DATE
    
    The constant propagation introduced in IMPALA-10064 handled conversion
    of < and > predicates from timestamps to dates incorrectly.
    
    Example:
    select * from functional.alltypes_date_partition
      where date_col = cast(timestamp_col as date)
        and timestamp_col > '2009-01-01 01:00:00'
        and timestamp_col < '2009-02-01 01:00:00';
    
    Before this change, query rewrites added the following predicates:
    date_col > DATE '2009-01-01' AND date_col < DATE '2009-02-01'
    This incorrectly rejected all timestamps on the days of the
    lower / upper bounds.
    
    The fix is to rewrite < and > to <= and >= in the date predicates.
    
    < could be kept if the upper bound is a constant with no time-of-day
    part, e.g. timestamp_col < "2009-01-01" could be rewritten to
    date_col < "2009-01-01", but this optimization is left out of this
    patch to keep it simple.
    
    Testing:
    - added planner + EE regression tests
    
    Change-Id: I1938bf5e91057b220daf8a1892940f674aac3d68
    Reviewed-on: http://gerrit.cloudera.org:8080/19572
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../impala/analysis/ConstantPredicateHandler.java  |  9 ++++++
 .../queries/PlannerTest/constant-propagation.test  | 36 ++++++++++++++++++++--
 .../QueryTest/range-constant-propagation.test      | 21 +++++++++++++
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java b/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java
index 36ae91360..1cf5e3822 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ConstantPredicateHandler.java
@@ -172,6 +172,15 @@ public class ConstantPredicateHandler {
 
   private Expr rewriteWithOp(BinaryPredicate pred, BinaryPredicate.Operator newOp,
     Analyzer analyzer) {
+    // Allow equality to avoid problems with truncating the time-of-day part of
+    // timestamps (IMPALA-11960).
+    // TODO: the original (stricter) op could be kept if the upper bound is a
+    //       TimestampLiteral without time component, e.g ts_col < "2000-01-01"
+    if (newOp == BinaryPredicate.Operator.LT) {
+      newOp = BinaryPredicate.Operator.LE;
+    } else if (newOp == BinaryPredicate.Operator.GT) {
+      newOp = BinaryPredicate.Operator.GE;
+    }
     if (pred.getOp() == newOp) return pred;
     BinaryPredicate newPred = new BinaryPredicate(newOp, pred.getChild(0),
         pred.getChild(1));
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test
index 350cfbddf..3455e227c 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-propagation.test
@@ -434,6 +434,38 @@ PLAN-ROOT SINK
    predicates: functional.alltypes_date_partition.timestamp_col <= TIMESTAMP '2009-02-01 00:00:00', functional.alltypes_date_partition.timestamp_col >= TIMESTAMP '2009-01-01 00:00:00', date_col = CAST(timestamp_col AS DATE)
    row-size=65B cardinality=30
 ====
+# Same as above but using < > and timestamps with time-of-day parts.
+# Regression test for IMPALA-11960.
+with dp_view as
+(select * from functional.alltypes_date_partition
+where date_col = cast(timestamp_col as date))
+select * from dp_view
+where timestamp_col > '2009-01-01 01:00:00' and timestamp_col < '2009-02-01 01:00:00';
+---- PLAN
+PLAN-ROOT SINK
+|
+00:SCAN HDFS [functional.alltypes_date_partition]
+   partition predicates: date_col >= DATE '2009-01-01' AND date_col <= DATE '2009-02-01'
+   HDFS partitions=32/55 files=32 size=15.99KB
+   predicates: functional.alltypes_date_partition.timestamp_col < TIMESTAMP '2009-02-01 01:00:00', functional.alltypes_date_partition.timestamp_col > TIMESTAMP '2009-01-01 01:00:00', date_col = CAST(timestamp_col AS DATE)
+   row-size=65B cardinality=30
+====
+# Same as above but using >= instead of >.
+# Regression test for IMPALA-11960.
+with dp_view as
+(select * from functional.alltypes_date_partition
+where date_col = cast(timestamp_col as date))
+select * from dp_view
+where timestamp_col >= '2009-01-01 01:00:00' and timestamp_col < '2009-02-01 01:00:00';
+---- PLAN
+PLAN-ROOT SINK
+|
+00:SCAN HDFS [functional.alltypes_date_partition]
+   partition predicates: date_col >= DATE '2009-01-01' AND date_col <= DATE '2009-02-01'
+   HDFS partitions=32/55 files=32 size=15.99KB
+   predicates: functional.alltypes_date_partition.timestamp_col < TIMESTAMP '2009-02-01 01:00:00', functional.alltypes_date_partition.timestamp_col >= TIMESTAMP '2009-01-01 01:00:00', date_col = CAST(timestamp_col AS DATE)
+   row-size=65B cardinality=30
+====
 # Mix of various predicates some of which are eligible for propagation
 with dp_view as
 (select * from functional.alltypes_date_partition
@@ -521,8 +553,8 @@ and timestamp_col > from_timestamp('2009-02-01 00:02:00', 'yyyy-mm-dd');
 PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional.alltypes_date_partition]
-   partition predicates: date_col > DATE '2009-02-01'
-   HDFS partitions=23/55 files=23 size=11.28KB
+   partition predicates: date_col >= DATE '2009-02-01'
+   HDFS partitions=24/55 files=24 size=11.82KB
    predicates: timestamp_col > TIMESTAMP '2009-02-01 00:00:00', date_col = CAST(timestamp_col AS DATE)
    row-size=65B cardinality=21
 ====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test b/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test
index aa714147d..ba7b3b11d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/range-constant-propagation.test
@@ -10,6 +10,27 @@ and timestamp_col between '2009-01-01' and '2009-02-01';
 BIGINT, BIGINT
 ====
 ---- QUERY
+# Same query as above with time-of-day parts. Regression test for IMPALA-11960.
+select count(*), sum(int_col) from alltypes_date_partition
+where date_col = cast(timestamp_col as date)
+and timestamp_col > '2009-01-01 01:00:00' and timestamp_col < '2009-02-01 01:00:00';
+---- RESULTS
+155,620
+---- TYPES
+BIGINT, BIGINT
+====
+---- QUERY
+# Same as above but using >= instead of > and using a lower bound that matches with
+# a row. Regression test for IMPALA-11960.
+select count(*), sum(int_col) from alltypes_date_partition
+where date_col = cast(timestamp_col as date)
+and timestamp_col >= '2009-01-01 00:08:00.280' and timestamp_col < '2009-02-01 01:00:00';
+---- RESULTS
+156,628
+---- TYPES
+BIGINT, BIGINT
+====
+---- QUERY
 # Mix of various predicates some of which are eligible for propagation
 with dp_view as
 (select * from alltypes_date_partition