You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/06/06 10:48:00 UTC

[GitHub] [arrow-datafusion] alamb commented on a change in pull request #508: add expr::like and expr::notlike to pruning logic

alamb commented on a change in pull request #508:
URL: https://github.com/apache/arrow-datafusion/pull/508#discussion_r646112932



##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -1095,6 +1133,60 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn row_group_predicate_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that is converted to a 'starts_with'
+        let expr = col("c1").like(lit("Banana%"));
+        let expected_expr =
+            "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'

Review comment:
       ```suggestion
           // test NOT LIKE operator that can't be converted to a 'starts_with'
   ```

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -586,8 +587,45 @@ fn build_predicate_expression(
                 .min_column_expr()?
                 .lt_eq(expr_builder.scalar_expr().clone())
         }
+        Operator::Like => {
+            match &**right {
+                // If the literal is a 'starts_with'
+                Expr::Literal(ScalarValue::Utf8(Some(string)))
+                    if !string.starts_with('%') =>
+                {
+                    let scalar_expr =
+                        Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', ""))));
+                    // Behaves like Eq
+                    let min_column_expr = expr_builder.min_column_expr()?;
+                    let max_column_expr = expr_builder.max_column_expr()?;
+                    min_column_expr
+                        .lt_eq(scalar_expr.clone())
+                        .and(scalar_expr.lt_eq(max_column_expr))
+                }
+                _ => unhandled,
+            }
+        }
+        Operator::NotLike => {
+            match &**right {
+                // If the literal is a 'starts_with'
+                Expr::Literal(ScalarValue::Utf8(Some(string)))
+                    if !string.starts_with('%') =>
+                {
+                    let scalar_expr =
+                        Expr::Literal(ScalarValue::Utf8(Some(string.replace('%', ""))));

Review comment:
       I am not sure if just removing `%` is correct:
   
   For example in  a pattern like `foo%bar` would be converted to `foobar` and when compared with a value of `fooaaabar` would be deemed "out of range" by this logic, even though it matches the original predicate `foo%bar`.
   
   
   If instead, for `foo%bar` we used `foo` (only use the string up to the first unescaped `%`) I think then the logic applies.

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -548,7 +549,7 @@ fn build_predicate_expression(
         // allow partial failure in predicate expression generation
         // this can still produce a useful predicate when multiple conditions are joined using AND
         Err(_) => {
-            return Ok(logical_plan::lit(true));
+            return Ok(unhandled);

Review comment:
       👍 thanks I forgot that

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -1095,6 +1133,60 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn row_group_predicate_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that is converted to a 'starts_with'
+        let expr = col("c1").like(lit("Banana%"));
+        let expected_expr =
+            "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").not().like(lit("Banana%"));

Review comment:
       I think there is a difference between `!(a LIKE b)` and `a NOT LIKE b` -- so to test the `NOT LIKE` operator above this should be something like
   
   ```suggestion
           let expr = col("c1").not_like(lit("Banana%");
   ```
   
   https://github.com/apache/arrow-datafusion/blob/master/datafusion/src/logical_plan/expr.rs#L455-L457

##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -1095,6 +1133,60 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn row_group_predicate_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that is converted to a 'starts_with'
+        let expr = col("c1").like(lit("Banana%"));
+        let expected_expr =
+            "#c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").like(lit("%Banana%"));
+        let expected_expr = "Boolean(true)";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_starts_with() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'
+        let expr = col("c1").not().like(lit("Banana%"));
+        let expected_expr =
+            "NOT #c1_min LtEq Utf8(\"Banana\") And Utf8(\"Banana\") LtEq NOT #c1_max";
+        let predicate_expr =
+            build_predicate_expression(&expr, &schema, &mut RequiredStatColumns::new())?;
+        assert_eq!(format!("{:?}", predicate_expr), expected_expr);
+
+        Ok(())
+    }
+
+    #[test]
+    fn row_group_predicate_not_like() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, true)]);
+        // test LIKE operator that can't be converted to a 'starts_with'

Review comment:
       ```suggestion
           // test NOT LIKE operator that can't be converted to a 'starts_with'
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org