You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/05/25 18:36:29 UTC

[arrow-rs] branch master updated: Implementation string concat (#1720)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 2ba1ef417 Implementation string concat (#1720)
2ba1ef417 is described below

commit 2ba1ef417837e994e4ed820a7d339708e6db7580
Author: Ismail-Maj <76...@users.noreply.github.com>
AuthorDate: Wed May 25 20:36:24 2022 +0200

    Implementation string concat (#1720)
    
    * first implementation string concat
    
    * faster implementation
    
    * no branching for loop and handle offsets
    
    * string module and more tests
    
    * empty string test
    
    * no null test and error test
    
    Co-authored-by: Ismail-Maj <is...@protonmail.com>
---
 arrow/src/compute/kernels/mod.rs    |   1 +
 arrow/src/compute/kernels/string.rs | 196 ++++++++++++++++++++++++++++++++++++
 2 files changed, 197 insertions(+)

diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs
index a0ef50a7b..73fef5b3d 100644
--- a/arrow/src/compute/kernels/mod.rs
+++ b/arrow/src/compute/kernels/mod.rs
@@ -31,6 +31,7 @@ pub mod limit;
 pub mod partition;
 pub mod regexp;
 pub mod sort;
+pub mod string;
 pub mod substring;
 pub mod take;
 pub mod temporal;
diff --git a/arrow/src/compute/kernels/string.rs b/arrow/src/compute/kernels/string.rs
new file mode 100644
index 000000000..2af0bf85a
--- /dev/null
+++ b/arrow/src/compute/kernels/string.rs
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::array::*;
+use crate::compute::util::combine_option_bitmap;
+use crate::error::{ArrowError, Result};
+
+/// Returns the element-wise concatenation of two [`GenericStringArray`]s.
+///
+/// Slot `i` of the result holds `left[i]` followed by `right[i]`. A slot of the
+/// result is null if either input is null at that position.
+///
+/// ```text
+/// e.g:
+///
+///   ["Hello"] + ["World"] = ["HelloWorld"]
+///
+///   ["a", "b"] + [None, "c"] = [None, "bc"]
+/// ```
+///
+/// # Errors
+///
+/// Returns [`ArrowError::ComputeError`] if `left` and `right` do not have the
+/// same length, or if the concatenated string data is too large to be indexed
+/// by `Offset`.
+pub fn string_concat<Offset: OffsetSizeTrait>(
+    left: &GenericStringArray<Offset>,
+    right: &GenericStringArray<Offset>,
+) -> Result<GenericStringArray<Offset>> {
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(format!(
+            "Arrays must have the same length: {} != {}",
+            left.len(),
+            right.len()
+        )));
+    }
+
+    // A slot is valid only when it is valid in both inputs.
+    let output_bitmap = combine_option_bitmap(left.data(), right.data(), left.len())?;
+
+    let left_offsets = left.value_offsets();
+    let right_offsets = right.value_offsets();
+
+    let left_buffer = left.value_data();
+    let right_buffer = right.value_data();
+    let left_values = left_buffer.as_slice();
+    let right_values = right_buffer.as_slice();
+
+    // Bytes contributed by each input: the span between its first and last
+    // offset. The offsets of a sliced array do not start at zero, and its value
+    // buffer may extend past the last offset, so the buffer length alone would
+    // over-estimate.
+    let left_len = left_offsets[left_offsets.len() - 1].to_usize().unwrap()
+        - left_offsets[0].to_usize().unwrap();
+    let right_len = right_offsets[right_offsets.len() - 1].to_usize().unwrap()
+        - right_offsets[0].to_usize().unwrap();
+    let total_len = left_len + right_len;
+
+    // Fail up front, before copying anything, when the result cannot be
+    // addressed by the offset type (e.g. more than `i32::MAX` bytes for
+    // `StringArray`) instead of panicking in the middle of the loop.
+    if Offset::from_usize(total_len).is_none() {
+        return Err(ArrowError::ComputeError(format!(
+            "Concatenated string data of {} bytes overflows the offset type",
+            total_len
+        )));
+    }
+
+    let mut output_values = BufferBuilder::<u8>::new(total_len);
+
+    let mut output_offsets = BufferBuilder::<Offset>::new(left_offsets.len());
+    output_offsets.append(Offset::zero());
+    // Values are copied for null slots as well, which keeps the loop free of
+    // branches; the validity bitmap hides them from readers.
+    for (left_idx, right_idx) in left_offsets.windows(2).zip(right_offsets.windows(2)) {
+        output_values.append_slice(
+            &left_values
+                [left_idx[0].to_usize().unwrap()..left_idx[1].to_usize().unwrap()],
+        );
+        output_values.append_slice(
+            &right_values
+                [right_idx[0].to_usize().unwrap()..right_idx[1].to_usize().unwrap()],
+        );
+        // Cannot fail: the running length never exceeds `total_len`, which was
+        // verified above to fit in `Offset`.
+        output_offsets.append(Offset::from_usize(output_values.len()).unwrap());
+    }
+
+    let mut builder =
+        ArrayDataBuilder::new(GenericStringArray::<Offset>::get_data_type())
+            .len(left.len())
+            .add_buffer(output_offsets.finish())
+            .add_buffer(output_values.finish());
+
+    if let Some(null_bitmap) = output_bitmap {
+        builder = builder.null_bit_buffer(null_bitmap);
+    }
+
+    // SAFETY - offsets valid by construction: they start at zero, never
+    // decrease, and the last one equals the length of the values buffer. Every
+    // value is two complete strings of the inputs placed back to back.
+    Ok(unsafe { builder.build_unchecked() }.into())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Concatenates `len` elements of `left` starting at `left_offset` with
+    /// `len` elements of `right` starting at `right_offset`, going through
+    /// sliced arrays so that non-zero array offsets are exercised.
+    fn concat_slices(
+        left: &StringArray,
+        left_offset: usize,
+        right: &StringArray,
+        right_offset: usize,
+        len: usize,
+    ) -> StringArray {
+        let lhs = left.slice(left_offset, len);
+        let rhs = right.slice(right_offset, len);
+
+        string_concat(
+            lhs.as_any().downcast_ref::<StringArray>().unwrap(),
+            rhs.as_any().downcast_ref::<StringArray>().unwrap(),
+        )
+        .unwrap()
+    }
+
+    // A null on either side makes the output slot null.
+    #[test]
+    fn test_string_concat() {
+        let lhs = StringArray::from(vec![Some("foo"), Some("bar"), None]);
+        let rhs = StringArray::from(vec![None, Some("yyy"), Some("zzz")]);
+
+        let actual = string_concat(&lhs, &rhs).unwrap();
+
+        let wanted = StringArray::from(vec![None, Some("baryyy"), None]);
+        assert_eq!(actual, wanted);
+    }
+
+    // Empty strings are valid values and concatenate to what is left over.
+    #[test]
+    fn test_string_concat_empty_string() {
+        let lhs = StringArray::from(vec![Some("foo"), Some(""), Some("bar")]);
+        let rhs = StringArray::from(vec![Some("baz"), Some(""), Some("")]);
+
+        let actual = string_concat(&lhs, &rhs).unwrap();
+
+        let wanted = StringArray::from(vec![Some("foobaz"), Some(""), Some("bar")]);
+        assert_eq!(actual, wanted);
+    }
+
+    // Inputs without a validity bitmap.
+    #[test]
+    fn test_string_concat_no_null() {
+        let lhs = StringArray::from(vec!["foo", "bar"]);
+        let rhs = StringArray::from(vec!["bar", "baz"]);
+
+        let actual = string_concat(&lhs, &rhs).unwrap();
+
+        let wanted = StringArray::from(vec!["foobar", "barbaz"]);
+        assert_eq!(actual, wanted);
+    }
+
+    // Inputs of different lengths are rejected with an error.
+    #[test]
+    fn test_string_concat_error() {
+        let lhs = StringArray::from(vec!["foo", "bar"]);
+        let rhs = StringArray::from(vec!["baz"]);
+
+        assert!(string_concat(&lhs, &rhs).is_err());
+    }
+
+    // Sliced inputs: offsets that do not start at zero and shifted bitmaps.
+    #[test]
+    fn test_string_concat_slice() {
+        let left = StringArray::from(vec![None, Some("foo"), Some("bar"), Some("baz")]);
+        let right = StringArray::from(vec![Some("boo"), None, Some("far"), Some("faz")]);
+
+        let actual = concat_slices(&left, 0, &right, 1, 3);
+        let wanted = StringArray::from(vec![None, Some("foofar"), Some("barfaz")]);
+        assert_eq!(actual, wanted);
+
+        let actual = concat_slices(&left, 2, &right, 1, 2);
+        let wanted = StringArray::from(vec![None, Some("bazfar")]);
+        assert_eq!(actual, wanted);
+    }
+}