You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/07 20:15:58 UTC
[arrow-datafusion] branch master updated: implement 'StringConcat' operator to support sql like "select 'aa' || 'b' " (#2142)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new ddf29f112 implement 'StringConcat' operator to support sql like "select 'aa' || 'b' " (#2142)
ddf29f112 is described below
commit ddf29f112030e476fe483ba435eec14d7ee1eb3e
Author: DuRipeng <45...@qq.com>
AuthorDate: Fri Apr 8 04:15:53 2022 +0800
implement 'StringConcat' operator to support sql like "select 'aa' || 'b' " (#2142)
* implement stringconcat operator
* snake case fix
* support non-string concat & handle NULL
* value -> array
* string concat internal coercion
* get NULL in right index of vec
Co-authored-by: duripeng <du...@baidu.com>
---
datafusion/core/src/sql/planner.rs | 1 +
datafusion/core/tests/sql/expr.rs | 53 ++++++++++++++++++++++
datafusion/expr/src/operator.rs | 3 ++
.../physical-expr/src/coercion_rule/binary_rule.rs | 31 +++++++++++++
datafusion/physical-expr/src/expressions/binary.rs | 31 +++++++++++++
5 files changed, 119 insertions(+)
diff --git a/datafusion/core/src/sql/planner.rs b/datafusion/core/src/sql/planner.rs
index 3598492e7..42d42d1f1 100644
--- a/datafusion/core/src/sql/planner.rs
+++ b/datafusion/core/src/sql/planner.rs
@@ -1328,6 +1328,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
BinaryOperator::PGRegexNotIMatch => Ok(Operator::RegexNotIMatch),
BinaryOperator::BitwiseAnd => Ok(Operator::BitwiseAnd),
BinaryOperator::BitwiseOr => Ok(Operator::BitwiseOr),
+ BinaryOperator::StringConcat => Ok(Operator::StringConcat),
_ => Err(DataFusionError::NotImplemented(format!(
"Unsupported SQL binary operator {:?}",
op
diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs
index 3863da442..2b12b3743 100644
--- a/datafusion/core/tests/sql/expr.rs
+++ b/datafusion/core/tests/sql/expr.rs
@@ -280,6 +280,59 @@ async fn query_scalar_minus_array() -> Result<()> {
Ok(())
}
+#[tokio::test]
+async fn test_string_concat_operator() -> Result<()> {
+ let ctx = SessionContext::new();
+ // concat 2 strings
+ let sql = "SELECT 'aa' || 'b'";
+ let actual = execute_to_batches(&ctx, sql).await;
+ let expected = vec![
+ "+-------------------------+",
+ "| Utf8(\"aa\") || Utf8(\"b\") |",
+ "+-------------------------+",
+ "| aab |",
+ "+-------------------------+",
+ ];
+ assert_batches_eq!(expected, &actual);
+
+ // concat 4 strings as a string concat pipe.
+ let sql = "SELECT 'aa' || 'b' || 'cc' || 'd'";
+ let actual = execute_to_batches(&ctx, sql).await;
+ let expected = vec![
+ "+----------------------------------------------------+",
+ "| Utf8(\"aa\") || Utf8(\"b\") || Utf8(\"cc\") || Utf8(\"d\") |",
+ "+----------------------------------------------------+",
+ "| aabccd |",
+ "+----------------------------------------------------+",
+ ];
+ assert_batches_eq!(expected, &actual);
+
+ // concat 2 strings and NULL, output should be NULL
+ let sql = "SELECT 'aa' || NULL || 'd'";
+ let actual = execute_to_batches(&ctx, sql).await;
+ let expected = vec![
+ "+---------------------------------------+",
+ "| Utf8(\"aa\") || Utf8(NULL) || Utf8(\"d\") |",
+ "+---------------------------------------+",
+ "| |",
+ "+---------------------------------------+",
+ ];
+ assert_batches_eq!(expected, &actual);
+
+ // concat 1 strings and 2 numeric
+ let sql = "SELECT 'a' || 42 || 23.3";
+ let actual = execute_to_batches(&ctx, sql).await;
+ let expected = vec![
+ "+-----------------------------------------+",
+ "| Utf8(\"a\") || Int64(42) || Float64(23.3) |",
+ "+-----------------------------------------+",
+ "| a4223.3 |",
+ "+-----------------------------------------+",
+ ];
+ assert_batches_eq!(expected, &actual);
+ Ok(())
+}
+
#[tokio::test]
async fn test_boolean_expressions() -> Result<()> {
test_expression!("true", "true");
diff --git a/datafusion/expr/src/operator.rs b/datafusion/expr/src/operator.rs
index e4a9871e6..d22cb8569 100644
--- a/datafusion/expr/src/operator.rs
+++ b/datafusion/expr/src/operator.rs
@@ -71,6 +71,8 @@ pub enum Operator {
BitwiseAnd,
/// Bitwise or, like `|`
BitwiseOr,
+ /// String concat
+ StringConcat,
}
impl fmt::Display for Operator {
@@ -99,6 +101,7 @@ impl fmt::Display for Operator {
Operator::IsNotDistinctFrom => "IS NOT DISTINCT FROM",
Operator::BitwiseAnd => "&",
Operator::BitwiseOr => "|",
+ Operator::StringConcat => "||",
};
write!(f, "{}", display)
}
diff --git a/datafusion/physical-expr/src/coercion_rule/binary_rule.rs b/datafusion/physical-expr/src/coercion_rule/binary_rule.rs
index ac23f2b1b..263523938 100644
--- a/datafusion/physical-expr/src/coercion_rule/binary_rule.rs
+++ b/datafusion/physical-expr/src/coercion_rule/binary_rule.rs
@@ -17,6 +17,7 @@
//! Coercion rules for matching argument types for binary operators
+use arrow::compute::can_cast_types;
use arrow::datatypes::{DataType, DECIMAL_MAX_PRECISION, DECIMAL_MAX_SCALE};
use datafusion_common::DataFusionError;
use datafusion_common::Result;
@@ -58,6 +59,8 @@ pub(crate) fn coerce_types(
| Operator::RegexIMatch
| Operator::RegexNotMatch
| Operator::RegexNotIMatch => string_coercion(lhs_type, rhs_type),
+ // "||" operator has its own rules, and always return a string type
+ Operator::StringConcat => string_concat_coercion(lhs_type, rhs_type),
Operator::IsDistinctFrom | Operator::IsNotDistinctFrom => {
eq_coercion(lhs_type, rhs_type)
}
@@ -369,6 +372,34 @@ fn dictionary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
}
}
+/// Coercion rules for string concat.
+/// This is a union of string coercion rules and specified rules:
+/// 1. At lease one side of lhs and rhs should be string type (Utf8 / LargeUtf8)
+/// 2. Data type of the other side should be able to cast to string type
+fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
+ use arrow::datatypes::DataType::*;
+ string_coercion(lhs_type, rhs_type).or(match (lhs_type, rhs_type) {
+ (Utf8, from_type) | (from_type, Utf8) => {
+ string_concat_internal_coercion(from_type, &Utf8)
+ }
+ (LargeUtf8, from_type) | (from_type, LargeUtf8) => {
+ string_concat_internal_coercion(from_type, &LargeUtf8)
+ }
+ _ => None,
+ })
+}
+
+fn string_concat_internal_coercion(
+ from_type: &DataType,
+ to_type: &DataType,
+) -> Option<DataType> {
+ if can_cast_types(from_type, to_type) {
+ Some(to_type.to_owned())
+ } else {
+ None
+ }
+}
+
/// Coercion rules for Strings: the type that both lhs and rhs can be
/// casted to for the purpose of a string computation
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs
index c3fc216aa..7cf460b6b 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -56,6 +56,7 @@ use arrow::record_batch::RecordBatch;
use crate::coercion_rule::binary_rule::coerce_types;
use crate::expressions::try_cast;
+use crate::string_expressions;
use crate::PhysicalExpr;
use datafusion_common::ScalarValue;
use datafusion_common::{DataFusionError, Result};
@@ -416,6 +417,33 @@ fn bitwise_or(left: ArrayRef, right: ArrayRef) -> Result<ArrayRef> {
}
}
+/// Use datafusion build-in expression `concat` to evaluate `StringConcat` operator.
+/// Besides, any `NULL` exists on lhs or rhs will come out result `NULL`
+/// 1. 'a' || 'b' || 32 = 'ab32'
+/// 2. 'a' || NULL = NULL
+fn string_concat(left: ArrayRef, right: ArrayRef) -> Result<ArrayRef> {
+ let ignore_null = match string_expressions::concat(&[
+ ColumnarValue::Array(left.clone()),
+ ColumnarValue::Array(right.clone()),
+ ])? {
+ ColumnarValue::Array(array_ref) => array_ref,
+ scalar_value => scalar_value.into_array(left.clone().len()),
+ };
+ let ignore_null_array = ignore_null.as_any().downcast_ref::<StringArray>().unwrap();
+ let result = (0..ignore_null_array.len())
+ .into_iter()
+ .map(|index| {
+ if left.is_null(index) || right.is_null(index) {
+ None
+ } else {
+ Some(ignore_null_array.value(index))
+ }
+ })
+ .collect::<StringArray>();
+
+ Ok(Arc::new(result) as ArrayRef)
+}
+
fn bitwise_and_scalar(
array: &dyn Array,
scalar: ScalarValue,
@@ -1005,6 +1033,8 @@ pub fn binary_operator_data_type(
| Operator::Divide
| Operator::Multiply
| Operator::Modulo => Ok(result_type),
+ // string operations return the same values as the common coerced type
+ Operator::StringConcat => Ok(result_type),
}
}
@@ -1266,6 +1296,7 @@ impl BinaryExpr {
}
Operator::BitwiseAnd => bitwise_and(left, right),
Operator::BitwiseOr => bitwise_or(left, right),
+ Operator::StringConcat => string_concat(left, right),
}
}
}