You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2021/11/03 12:22:21 UTC

[impala] 01/02: IMPALA-10836: Add 'SimplifyCastExprRule' rule to rewrite cast expr in some situations

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 217d821e94cc8571c7f21e8a4d55a3d3f900c116
Author: skyyws <sk...@163.com>
AuthorDate: Mon Sep 6 20:32:12 2021 +0800

    IMPALA-10836: Add 'SimplifyCastExprRule' rule to rewrite cast expr in some situations
    
    This patch adds a new expr rewrite rule to simplify some cast expr when
    cast target data type is the same as inner expr data type. We will
    remove unnecessary cast expr if any rules are matched. This kind of
    rewrite will improve query performance when casting a non-partition
    column, especially when scanning lots of data. Besides, cast expr in
    where clause can not pushdown to Kudu server, if we can remove
    unnecessary cast expr, Impala will pushdown this predicate to Kudu
    server, and this will save lots of time and IO/memmory.
    
    Testing:
    - Added unit test cases in `ExprRewriteRulesTest`
    
    Change-Id: Id8fac7100060d4e139a8b24d4795c6f279c55954
    Reviewed-on: http://gerrit.cloudera.org:8080/17933
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Zoltan Borok-Nagy <bo...@cloudera.com>
---
 .../java/org/apache/impala/analysis/Analyzer.java  |  2 +
 .../impala/rewrite/SimplifyCastExprRule.java       | 62 +++++++++++++++++
 .../impala/analysis/ExprRewriteRulesTest.java      | 80 ++++++++++++++++++++++
 .../queries/PlannerTest/insert-sort-by.test        |  4 +-
 4 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index 9734c3a..c3d48dc 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -68,6 +68,7 @@ import org.apache.impala.compat.MetastoreShim;
 import org.apache.impala.planner.JoinNode;
 import org.apache.impala.planner.PlanNode;
 import org.apache.impala.rewrite.BetweenToCompoundRule;
+import org.apache.impala.rewrite.SimplifyCastExprRule;
 import org.apache.impala.rewrite.ConvertToCNFRule;
 import org.apache.impala.rewrite.EqualityDisjunctsToInRule;
 import org.apache.impala.rewrite.ExprRewriteRule;
@@ -535,6 +536,7 @@ public class Analyzer {
         rules.add(SimplifyCastStringToTimestamp.INSTANCE);
         rules.add(CountDistinctToNdvRule.INSTANCE);
         rules.add(DefaultNdvScaleRule.INSTANCE);
+        rules.add(SimplifyCastExprRule.INSTANCE);
       }
       exprRewriter_ = new ExprRewriter(rules);
     }
diff --git a/fe/src/main/java/org/apache/impala/rewrite/SimplifyCastExprRule.java b/fe/src/main/java/org/apache/impala/rewrite/SimplifyCastExprRule.java
new file mode 100644
index 0000000..4272e7b
--- /dev/null
+++ b/fe/src/main/java/org/apache/impala/rewrite/SimplifyCastExprRule.java
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.rewrite;
+
+import com.google.common.base.Preconditions;
+import org.apache.impala.analysis.Analyzer;
+import org.apache.impala.analysis.CastExpr;
+import org.apache.impala.analysis.Expr;
+import org.apache.impala.analysis.NullLiteral;
+
+/**
+ * This rewrite rule simplifies cast expr when cast target data type is the same as inner
+ * expr data type.
+ * Examples:
+ * CAST(str_col AS STRING) ==> str_col
+ * SUM(CAST(int_col AS INT)) ==> SUM(int_col)
+ * CAST(int_col+int_col AS INT) ==> int_col+int_col
+ * CAST(CAST(col as BIGINT) as BIGINT) ==> CAST(col as BIGINT)
+ *
+ * This rule is also valid for complex type.
+ * Examples:
+ * CAST(int_array.item as INT) ==> int_array.item
+ * CAST(string_map.value as STRING) ==> string_map.value
+ * CAST(CAST(array.item as INT) AS INT) ==> CAST(array.item as INT)
+ * CAST(CAST(map.value as STRING) AS STRING) ==> CAST(map.value as STRING)
+ *
+ * Pay attention:
+ * For varchar type, we remove cast expr only when lengths are the same.
+ * For decimal type, we remove cast expr only when precisions and scales are the same.
+ */
+public class SimplifyCastExprRule implements ExprRewriteRule {
+  public static ExprRewriteRule INSTANCE = new SimplifyCastExprRule();
+
+  @Override
+  public Expr apply(Expr expr, Analyzer analyzer) {
+    if (!(expr instanceof CastExpr)) return expr;
+
+    Preconditions.checkState(expr.getChildren().size() == 1);
+    CastExpr castExpr = (CastExpr) expr;
+    Expr child = castExpr.getChild(0);
+    //In analysis phase, CaseExpr will convert child to target type, we cannot rewrite in
+    //this situation, such as cast(cast(NULL as string) as timestamp FORMAT 'YYYY-MM-DD').
+    //If we replace inner 'cast(NULL as string)' as 'NULL', query will throw exception.
+    if (child instanceof NullLiteral) return castExpr;
+    return castExpr.getType().matchesType(child.getType())? child : castExpr;
+  }
+}
diff --git a/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java b/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
index d91a53c..e799a35 100644
--- a/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/ExprRewriteRulesTest.java
@@ -32,6 +32,7 @@ import org.apache.impala.common.ImpalaException;
 import org.apache.impala.common.QueryFixture;
 import org.apache.impala.common.SqlCastException;
 import org.apache.impala.rewrite.BetweenToCompoundRule;
+import org.apache.impala.rewrite.SimplifyCastExprRule;
 import org.apache.impala.rewrite.ConvertToCNFRule;
 import org.apache.impala.rewrite.EqualityDisjunctsToInRule;
 import org.apache.impala.rewrite.ExprRewriteRule;
@@ -915,4 +916,83 @@ public class ExprRewriteRulesTest extends FrontendTestBase {
     RewritesOk(
         "NULL || bool_col", extractCompoundVerticalBarExprRule, "NULL OR bool_col");
   }
+
+  @Test
+  public void testSimplifyCastExprRule() throws ImpalaException {
+    ExprRewriteRule rule = SimplifyCastExprRule.INSTANCE;
+
+    //Inner expr is a column
+    RewritesOk("functional.alltypes", "CAST(int_col AS INT)", rule, "int_col");
+    RewritesOk("functional.alltypes", "CAST(date_string_col AS STRING)", rule,
+        "date_string_col");
+    RewritesOk("functional.alltypes", "CAST(timestamp_col AS TIMESTAMP)", rule,
+        "timestamp_col");
+    //Multi-layer cast
+    RewritesOk("functional.alltypes", "CAST(CAST(int_col AS INT) AS INT)", rule,
+        "int_col");
+    //No rewrite
+    RewritesOk("functional.alltypes", "CAST(int_col AS BIGINT)", rule, null);
+
+    //Inner expr is an arithmetic expr
+    RewritesOk("functional.alltypes", "CAST(bigint_col+bigint_col AS BIGINT)", rule,
+        "bigint_col + bigint_col");
+    //Multi-layer cast
+    RewritesOk("functional.alltypes", "SUM(CAST(bigint_col+bigint_col AS BIGINT))", rule,
+        "sum(bigint_col + bigint_col)");
+    //No rewrite
+    RewritesOk("functional.alltypes", "CAST(bigint_col+bigint_col AS DOUBLE)", rule,
+        null);
+
+    //Inner expr is also a cast expr
+    RewritesOk("functional.alltypes", "CAST(CAST(int_col AS BIGINT) AS BIGINT)", rule,
+        "CAST(int_col AS BIGINT)");
+    //Multi-layer cast
+    RewritesOk("functional.alltypes",
+        "CAST(CAST(CAST(int_col AS BIGINT) AS BIGINT) AS BIGINT)", rule,
+        "CAST(int_col AS BIGINT)");
+    //No rewrite
+    RewritesOk("functional.alltypes", "CAST(CAST(int_col AS BIGINT) AS INT)", rule, null);
+
+    //Decimal type, d3 type is DECIMAL(20,10)
+    RewritesOk("functional.decimal_tbl", "CAST(d3 AS DECIMAL(20,10))", rule, "d3");
+    RewritesOk("functional.decimal_tbl",
+        "CAST(CAST(d3 AS DECIMAL(10,5)) AS DECIMAL(10,5))", rule,
+        "CAST(d3 AS DECIMAL(10,5))");
+    //No rewrite
+    RewritesOk("functional.decimal_tbl", "CAST(d3 AS DECIMAL(10,5))", rule, null);
+    RewritesOk("functional.decimal_tbl", "CAST(d3 AS DECIMAL(25,15))", rule, null);
+    RewritesOk("functional.decimal_tbl",
+        "CAST(CAST(d3 AS DECIMAL(10,5)) AS DECIMAL(15,5))", rule, null);
+
+    //Varchar type, vc type is VARCHAR(32)
+    RewritesOk("functional.chars_formats", "CAST(vc AS VARCHAR(32))", rule, "vc");
+    RewritesOk("functional.chars_formats",
+        "CAST(CAST(vc AS VARCHAR(16)) AS VARCHAR(16))", rule, "CAST(vc AS VARCHAR(16))");
+    //No rewrite
+    RewritesOk("functional.chars_formats", "CAST(vc AS VARCHAR(16))", rule, null);
+    RewritesOk("functional.chars_formats", "CAST(vc AS VARCHAR(48))", rule, null);
+    RewritesOk("functional.chars_formats",
+        "CAST(CAST(vc AS VARCHAR(48)) AS VARCHAR(16))", rule, null);
+
+    //Array type
+    RewritesOk("functional.allcomplextypes.int_array_col",
+        "CAST(int_array_col.item AS INT)", rule, "int_array_col.item");
+    RewritesOk("functional.allcomplextypes.int_array_col",
+        "CAST(CAST(int_array_col.item AS BIGINT) AS BIGINT)", rule,
+        "CAST(int_array_col.item AS BIGINT)");
+
+    //Map type
+    RewritesOk("functional.allcomplextypes.int_map_col",
+        "CAST(int_map_col.key AS STRING)", rule, "int_map_col.`key`");
+    RewritesOk("functional.allcomplextypes.int_map_col",
+        "CAST(CAST(int_map_col.value AS STRING) AS STRING)", rule,
+        "CAST(int_map_col.value AS STRING)");
+
+    //Struct type
+    RewritesOk("functional.allcomplextypes", "CAST(int_struct_col.f1 AS INT)", rule,
+        "int_struct_col.f1");
+    RewritesOk("functional.allcomplextypes",
+        "CAST(CAST(int_struct_col.f1 AS BIGINT) AS BIGINT)", rule,
+        "CAST(int_struct_col.f1 AS BIGINT)");
+  }
 }
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test b/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test
index 525ce97..1e20c4b 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/insert-sort-by.test
@@ -387,7 +387,7 @@ WRITE TO HDFS [test_sort_by.t_nopart, OVERWRITE=false]
 |  partitions=1
 |
 02:SORT
-|  order by: CAST(id AS INT) ASC NULLS LAST
+|  order by: id ASC NULLS LAST
 |  row-size=1B cardinality=2
 |
 01:AGGREGATE [FINALIZE]
@@ -402,7 +402,7 @@ WRITE TO HDFS [test_sort_by.t_nopart, OVERWRITE=false]
 |  partitions=1
 |
 02:SORT
-|  order by: CAST(id AS INT) ASC NULLS LAST
+|  order by: id ASC NULLS LAST
 |  row-size=1B cardinality=2
 |
 01:AGGREGATE [FINALIZE]