You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/07 20:15:58 UTC

[arrow-datafusion] branch master updated: implement 'StringConcat' operator to support sql like "select 'aa' || 'b' " (#2142)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new ddf29f112 implement 'StringConcat' operator to support sql like "select 'aa' || 'b' " (#2142)
ddf29f112 is described below

commit ddf29f112030e476fe483ba435eec14d7ee1eb3e
Author: DuRipeng <45...@qq.com>
AuthorDate: Fri Apr 8 04:15:53 2022 +0800

    implement 'StringConcat' operator to support sql like "select 'aa' || 'b' " (#2142)
    
    * implement stringconcat operator
    
    * snake case fix
    
    * support non-string concat & handle NULL
    
    * value -> array
    
    * string concat internal coercion
    
    * get NULL in right index of vec
    
    Co-authored-by: duripeng <du...@baidu.com>
---
 datafusion/core/src/sql/planner.rs                 |  1 +
 datafusion/core/tests/sql/expr.rs                  | 53 ++++++++++++++++++++++
 datafusion/expr/src/operator.rs                    |  3 ++
 .../physical-expr/src/coercion_rule/binary_rule.rs | 31 +++++++++++++
 datafusion/physical-expr/src/expressions/binary.rs | 31 +++++++++++++
 5 files changed, 119 insertions(+)

diff --git a/datafusion/core/src/sql/planner.rs b/datafusion/core/src/sql/planner.rs
index 3598492e7..42d42d1f1 100644
--- a/datafusion/core/src/sql/planner.rs
+++ b/datafusion/core/src/sql/planner.rs
@@ -1328,6 +1328,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
             BinaryOperator::PGRegexNotIMatch => Ok(Operator::RegexNotIMatch),
             BinaryOperator::BitwiseAnd => Ok(Operator::BitwiseAnd),
             BinaryOperator::BitwiseOr => Ok(Operator::BitwiseOr),
+            BinaryOperator::StringConcat => Ok(Operator::StringConcat),
             _ => Err(DataFusionError::NotImplemented(format!(
                 "Unsupported SQL binary operator {:?}",
                 op
diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs
index 3863da442..2b12b3743 100644
--- a/datafusion/core/tests/sql/expr.rs
+++ b/datafusion/core/tests/sql/expr.rs
@@ -280,6 +280,59 @@ async fn query_scalar_minus_array() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn test_string_concat_operator() -> Result<()> {
+    let ctx = SessionContext::new();
+    // concat 2 strings
+    let sql = "SELECT 'aa' || 'b'";
+    let actual = execute_to_batches(&ctx, sql).await;
+    let expected = vec![
+        "+-------------------------+",
+        "| Utf8(\"aa\") || Utf8(\"b\") |",
+        "+-------------------------+",
+        "| aab                     |",
+        "+-------------------------+",
+    ];
+    assert_batches_eq!(expected, &actual);
+
+    // concat 4 strings as a string concat pipe.
+    let sql = "SELECT 'aa' || 'b' || 'cc' || 'd'";
+    let actual = execute_to_batches(&ctx, sql).await;
+    let expected = vec![
+        "+----------------------------------------------------+",
+        "| Utf8(\"aa\") || Utf8(\"b\") || Utf8(\"cc\") || Utf8(\"d\") |",
+        "+----------------------------------------------------+",
+        "| aabccd                                             |",
+        "+----------------------------------------------------+",
+    ];
+    assert_batches_eq!(expected, &actual);
+
+    // concat 2 strings and NULL, output should be NULL
+    let sql = "SELECT 'aa' || NULL || 'd'";
+    let actual = execute_to_batches(&ctx, sql).await;
+    let expected = vec![
+        "+---------------------------------------+",
+        "| Utf8(\"aa\") || Utf8(NULL) || Utf8(\"d\") |",
+        "+---------------------------------------+",
+        "|                                       |",
+        "+---------------------------------------+",
+    ];
+    assert_batches_eq!(expected, &actual);
+
+    // concat 1 strings and 2 numeric
+    let sql = "SELECT 'a' || 42 || 23.3";
+    let actual = execute_to_batches(&ctx, sql).await;
+    let expected = vec![
+        "+-----------------------------------------+",
+        "| Utf8(\"a\") || Int64(42) || Float64(23.3) |",
+        "+-----------------------------------------+",
+        "| a4223.3                                 |",
+        "+-----------------------------------------+",
+    ];
+    assert_batches_eq!(expected, &actual);
+    Ok(())
+}
+
 #[tokio::test]
 async fn test_boolean_expressions() -> Result<()> {
     test_expression!("true", "true");
diff --git a/datafusion/expr/src/operator.rs b/datafusion/expr/src/operator.rs
index e4a9871e6..d22cb8569 100644
--- a/datafusion/expr/src/operator.rs
+++ b/datafusion/expr/src/operator.rs
@@ -71,6 +71,8 @@ pub enum Operator {
     BitwiseAnd,
     /// Bitwise or, like `|`
     BitwiseOr,
+    /// String concat
+    StringConcat,
 }
 
 impl fmt::Display for Operator {
@@ -99,6 +101,7 @@ impl fmt::Display for Operator {
             Operator::IsNotDistinctFrom => "IS NOT DISTINCT FROM",
             Operator::BitwiseAnd => "&",
             Operator::BitwiseOr => "|",
+            Operator::StringConcat => "||",
         };
         write!(f, "{}", display)
     }
diff --git a/datafusion/physical-expr/src/coercion_rule/binary_rule.rs b/datafusion/physical-expr/src/coercion_rule/binary_rule.rs
index ac23f2b1b..263523938 100644
--- a/datafusion/physical-expr/src/coercion_rule/binary_rule.rs
+++ b/datafusion/physical-expr/src/coercion_rule/binary_rule.rs
@@ -17,6 +17,7 @@
 
 //! Coercion rules for matching argument types for binary operators
 
+use arrow::compute::can_cast_types;
 use arrow::datatypes::{DataType, DECIMAL_MAX_PRECISION, DECIMAL_MAX_SCALE};
 use datafusion_common::DataFusionError;
 use datafusion_common::Result;
@@ -58,6 +59,8 @@ pub(crate) fn coerce_types(
         | Operator::RegexIMatch
         | Operator::RegexNotMatch
         | Operator::RegexNotIMatch => string_coercion(lhs_type, rhs_type),
+        // "||" operator has its own rules, and always return a string type
+        Operator::StringConcat => string_concat_coercion(lhs_type, rhs_type),
         Operator::IsDistinctFrom | Operator::IsNotDistinctFrom => {
             eq_coercion(lhs_type, rhs_type)
         }
@@ -369,6 +372,34 @@ fn dictionary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
     }
 }
 
+/// Coercion rules for string concat.
+/// This is a union of string coercion rules and specified rules:
+/// 1. At lease one side of lhs and rhs should be string type (Utf8 / LargeUtf8)
+/// 2. Data type of the other side should be able to cast to string type
+fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+    string_coercion(lhs_type, rhs_type).or(match (lhs_type, rhs_type) {
+        (Utf8, from_type) | (from_type, Utf8) => {
+            string_concat_internal_coercion(from_type, &Utf8)
+        }
+        (LargeUtf8, from_type) | (from_type, LargeUtf8) => {
+            string_concat_internal_coercion(from_type, &LargeUtf8)
+        }
+        _ => None,
+    })
+}
+
+fn string_concat_internal_coercion(
+    from_type: &DataType,
+    to_type: &DataType,
+) -> Option<DataType> {
+    if can_cast_types(from_type, to_type) {
+        Some(to_type.to_owned())
+    } else {
+        None
+    }
+}
+
 /// Coercion rules for Strings: the type that both lhs and rhs can be
 /// casted to for the purpose of a string computation
 fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index c3fc216aa..7cf460b6b 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -56,6 +56,7 @@ use arrow::record_batch::RecordBatch;
 
 use crate::coercion_rule::binary_rule::coerce_types;
 use crate::expressions::try_cast;
+use crate::string_expressions;
 use crate::PhysicalExpr;
 use datafusion_common::ScalarValue;
 use datafusion_common::{DataFusionError, Result};
@@ -416,6 +417,33 @@ fn bitwise_or(left: ArrayRef, right: ArrayRef) -> Result<ArrayRef> {
     }
 }
 
+/// Use datafusion build-in expression `concat` to evaluate `StringConcat` operator.
+/// Besides, any `NULL` exists on lhs or rhs will come out result `NULL`
+/// 1. 'a' || 'b' || 32 = 'ab32'
+/// 2. 'a' || NULL = NULL
+fn string_concat(left: ArrayRef, right: ArrayRef) -> Result<ArrayRef> {
+    let ignore_null = match string_expressions::concat(&[
+        ColumnarValue::Array(left.clone()),
+        ColumnarValue::Array(right.clone()),
+    ])? {
+        ColumnarValue::Array(array_ref) => array_ref,
+        scalar_value => scalar_value.into_array(left.clone().len()),
+    };
+    let ignore_null_array = ignore_null.as_any().downcast_ref::<StringArray>().unwrap();
+    let result = (0..ignore_null_array.len())
+        .into_iter()
+        .map(|index| {
+            if left.is_null(index) || right.is_null(index) {
+                None
+            } else {
+                Some(ignore_null_array.value(index))
+            }
+        })
+        .collect::<StringArray>();
+
+    Ok(Arc::new(result) as ArrayRef)
+}
+
 fn bitwise_and_scalar(
     array: &dyn Array,
     scalar: ScalarValue,
@@ -1005,6 +1033,8 @@ pub fn binary_operator_data_type(
         | Operator::Divide
         | Operator::Multiply
         | Operator::Modulo => Ok(result_type),
+        // string operations return the same values as the common coerced type
+        Operator::StringConcat => Ok(result_type),
     }
 }
 
@@ -1266,6 +1296,7 @@ impl BinaryExpr {
             }
             Operator::BitwiseAnd => bitwise_and(left, right),
             Operator::BitwiseOr => bitwise_or(left, right),
+            Operator::StringConcat => string_concat(left, right),
         }
     }
 }