You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2021/06/05 10:16:16 UTC

[arrow-datafusion] branch i507-string-like-prune created (now 1062d5c)

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a change to branch i507-string-like-prune
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git.


      at 1062d5c  add expr::like and expr::notlike to pruning logic

This branch includes the following new commits:

     new 1062d5c  add expr::like and expr::notlike to pruning logic

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[arrow-datafusion] 01/01: add expr::like and expr::notlike to pruning logic

Posted by ne...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch i507-string-like-prune
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git

commit 1062d5c8e77291bd7ae2245b2f701c12d4d27310
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Sat Jun 5 11:57:56 2021 +0200

    add expr::like and expr::notlike to pruning logic
---
 datafusion/src/physical_optimizer/pruning.rs | 96 +++++++++++++++++++++++++++-
 1 file changed, 94 insertions(+), 2 deletions(-)

diff --git a/datafusion/src/physical_optimizer/pruning.rs b/datafusion/src/physical_optimizer/pruning.rs
index c65733b..0e43e4e 100644
--- a/datafusion/src/physical_optimizer/pruning.rs
+++ b/datafusion/src/physical_optimizer/pruning.rs
@@ -42,6 +42,7 @@ use crate::{
     logical_plan::{Expr, Operator},
     optimizer::utils,
     physical_plan::{planner::DefaultPhysicalPlanner, ColumnarValue, PhysicalExpr},
+    scalar::ScalarValue,
 };
 
 /// Interface to pass statistics information to [`PruningPredicates`]
@@ -548,7 +549,7 @@ fn build_predicate_expression(
         // allow partial failure in predicate expression generation
         // this can still produce a useful predicate when multiple conditions are joined using AND
         Err(_) => {
-            return Ok(logical_plan::lit(true));
+            return Ok(unhandled);
         }
     };
     let corrected_op = expr_builder.correct_operator(op);
@@ -586,8 +587,45 @@ fn build_predicate_expression(
                 .min_column_expr()?
                 .lt_eq(expr_builder.scalar_expr().clone())
         }
+        Operator::Like => {
+            match &**right {
+                // If the literal is a 'starts_with'
+                Expr::Literal(ScalarValue::Utf8(Some(string)))
+                    if !string.starts_with('%') =>
+                {
+                    let scalar_expr =
+                        Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', ""))));
+                    // Behaves like Eq
+                    let min_column_expr = expr_builder.min_column_expr()?;
+                    let max_column_expr = expr_builder.max_column_expr()?;
+                    min_column_expr
+                        .lt_eq(scalar_expr.clone())
+                        .and(scalar_expr.lt_eq(max_column_expr))
+                }
+                _ => unhandled,
+            }
+        }
+        Operator::NotLike => {
+            match &**right {
+                // If the literal is a 'starts_with'
+                Expr::Literal(ScalarValue::Utf8(Some(string)))
+                    if !string.starts_with('%') =>
+                {
+                    let scalar_expr =
+                        Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', ""))));
+                    // Uses the same min/max bounds as Like; comparisons are flipped below
+                    let min_column_expr = expr_builder.min_column_expr()?;
+                    let max_column_expr = expr_builder.max_column_expr()?;
+                    // Inverse of Like
+                    min_column_expr
+                        .gt_eq(scalar_expr.clone())
+                        .and(scalar_expr.gt_eq(max_column_expr))
+                }
+                _ => unhandled,
+            }
+        }
         // other expressions are not supported
-        _ => logical_plan::lit(true),
+        _ => unhandled,
     };
     Ok(statistics_expr)
 }
@@ -1096,6 +1134,60 @@ mod tests {
     }
 
     #[test]
+    fn row_group_predicate_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that is converted to a 'starts_with'
+        let expr = col("c1").like(lit("Banana%"));
+        let expected_expr =
+            "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE on a negated column that is still converted to a 'starts_with'
+        let expr = col("c1").not().like(lit("Banana%"));
+        let expected_expr =
+            "NOT #c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq NOT #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE on a negated column that can't be converted to a 'starts_with'
+        let expr = col("c1").not().like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
     fn row_group_predicate_required_columns() -> Result<()> {
         let schema = Schema::new(vec![
             Field::new("c1", DataType::Int32, false),