You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/09/27 21:29:40 UTC

[arrow] branch master updated: ARROW-10019: [Rust] Add substring kernel

This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new c4b0d0e  ARROW-10019: [Rust] Add substring kernel
c4b0d0e is described below

commit c4b0d0e6cdb49f508f3aeca90a47bf8d0c8a4ffd
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Sun Sep 27 23:28:47 2020 +0200

    ARROW-10019: [Rust] Add substring kernel
    
    This PR adds a new function to kernels called `substring` that yields a substring of a StringArray starting at a given index, and with a given optional length.
    
    `fn substring(array: &Array, start: i32, length: &Option<u32>) -> Result<ArrayRef>`
    
    This operation is common in strings, and it is useful for string-based transformations.
    
    The API is inspired by Spark's API.
    
    Closes #8199 from jorgecarleitao/substring
    
    Authored-by: Jorge C. Leitao <jo...@gmail.com>
    Signed-off-by: Neville Dipale <ne...@gmail.com>
---
 rust/arrow/README.md                        |   6 +
 rust/arrow/src/array/array.rs               |   4 +-
 rust/arrow/src/compute/kernels/mod.rs       |   1 +
 rust/arrow/src/compute/kernels/substring.rs | 274 ++++++++++++++++++++++++++++
 rust/arrow/src/compute/kernels/take.rs      |   6 +-
 5 files changed, 285 insertions(+), 6 deletions(-)

diff --git a/rust/arrow/README.md b/rust/arrow/README.md
index fc90fe9..cb8d285 100644
--- a/rust/arrow/README.md
+++ b/rust/arrow/README.md
@@ -89,6 +89,12 @@ The current status is:
 - [X] Arrow IPC
 - [ ] Interop tests with other implementations
 
+- operations
+  - string
+    - [x] length
+    - [x] substring
+    - [x] take
+
 ## Examples
 
 The examples folder shows how to construct some different types of Arrow
diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index e14b34f..bfbfe39 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -893,7 +893,7 @@ pub trait ListArrayOps<OffsetSize: OffsetSizeTrait> {
 }
 
 /// trait declaring an offset size, relevant for i32 vs i64 array types.
-pub trait OffsetSizeTrait: ArrowNativeType + Num {
+pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord {
     fn prefix() -> &'static str;
 
     fn to_isize(&self) -> isize;
@@ -1503,7 +1503,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
                 values.extend_from_slice(s.as_bytes());
             } else {
                 offsets.push(length_so_far);
-                values.extend_from_slice("".as_bytes());
+                values.extend_from_slice(b"");
             }
         }
         let array_data = ArrayData::builder(data_type)
diff --git a/rust/arrow/src/compute/kernels/mod.rs b/rust/arrow/src/compute/kernels/mod.rs
index 0cd3d70..5ac0f0b 100644
--- a/rust/arrow/src/compute/kernels/mod.rs
+++ b/rust/arrow/src/compute/kernels/mod.rs
@@ -27,5 +27,6 @@ pub mod filter;
 pub mod length;
 pub mod limit;
 pub mod sort;
+pub mod substring;
 pub mod take;
 pub mod temporal;
diff --git a/rust/arrow/src/compute/kernels/substring.rs b/rust/arrow/src/compute/kernels/substring.rs
new file mode 100644
index 0000000..04be80b
--- /dev/null
+++ b/rust/arrow/src/compute/kernels/substring.rs
@@ -0,0 +1,274 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines kernel to extract a substring of a [Large]StringArray
+
+use crate::{array::*, buffer::Buffer, datatypes::ToByteSlice};
+use crate::{
+    datatypes::DataType,
+    error::{ArrowError, Result},
+};
+use std::sync::Arc;
+
+fn substring1<OffsetSize: OffsetSizeTrait>(
+    array: &GenericStringArray<OffsetSize>,
+    start: OffsetSize,
+    length: &Option<OffsetSize>,
+    datatype: DataType,
+) -> Result<ArrayRef> {
+    // compute current offsets
+    let offsets = array.data_ref().clone().buffers()[0].clone();
+    let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };
+
+    // compute null bitmap (copy)
+    let null_bit_buffer = array.data_ref().null_buffer().cloned();
+
+    // compute values
+    let values = &array.data_ref().buffers()[1];
+    let data = values.data();
+
+    let mut new_values = Vec::new(); // we have no way to estimate how much this will be.
+    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
+
+    let mut length_so_far = OffsetSize::zero();
+    new_offsets.push(length_so_far);
+    (0..array.len()).for_each(|i| {
+        // the length of this entry
+        let lenght_i: OffsetSize = offsets[i + 1] - offsets[i];
+        // compute where we should start slicing this entry
+        let start = offsets[i]
+            + if start >= OffsetSize::zero() {
+                start
+            } else {
+                lenght_i + start
+            };
+
+        let start = start.max(offsets[i]).min(offsets[i + 1]);
+        // compute the lenght of the slice
+        let length: OffsetSize = length
+            .unwrap_or(lenght_i)
+            // .max(0) is not needed as it is guaranteed
+            .min(offsets[i + 1] - start); // so we do not go beyond this entry
+
+        length_so_far = length_so_far + length;
+
+        new_offsets.push(length_so_far);
+
+        // we need usize for ranges
+        let start = start.to_usize().unwrap();
+        let length = length.to_usize().unwrap();
+
+        new_values.extend_from_slice(&data[start..start + length]);
+    });
+
+    let data = ArrayData::new(
+        datatype,
+        array.len(),
+        None,
+        null_bit_buffer,
+        0,
+        vec![
+            Buffer::from(new_offsets.to_byte_slice()),
+            Buffer::from(&new_values[..]),
+        ],
+        vec![],
+    );
+    Ok(make_array(Arc::new(data)))
+}
+
+/// Returns an ArrayRef with a substring starting from `start` and with optional length `length` of each of the elements in `array`.
+/// `start` can be negative, in which case the start counts from the end of the string.
+/// this function errors when the passed array is not a [Large]String array.
+pub fn substring(array: &Array, start: i64, length: &Option<u64>) -> Result<ArrayRef> {
+    match array.data_type() {
+        DataType::LargeUtf8 => substring1(
+            array
+                .as_any()
+                .downcast_ref::<LargeStringArray>()
+                .expect("A large string is expected"),
+            start,
+            &length.map(|e| e as i64),
+            DataType::LargeUtf8,
+        ),
+        DataType::Utf8 => substring1(
+            array
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .expect("A string is expected"),
+            start as i32,
+            &length.map(|e| e as i32),
+            DataType::Utf8,
+        ),
+        _ => Err(ArrowError::ComputeError(format!(
+            "substring does not support type {:?}",
+            array.data_type()
+        ))),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn with_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
+    ) -> Result<()> {
+        let cases = vec![
+            // identity
+            (
+                vec![Some("hello"), None, Some("word")],
+                0,
+                None,
+                vec![Some("hello"), None, Some("word")],
+            ),
+            // 0 length -> Nothing
+            (
+                vec![Some("hello"), None, Some("word")],
+                0,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high start -> Nothing
+            (
+                vec![Some("hello"), None, Some("word")],
+                1000,
+                Some(0),
+                vec![Some(""), None, Some("")],
+            ),
+            // high negative start -> identity
+            (
+                vec![Some("hello"), None, Some("word")],
+                -1000,
+                None,
+                vec![Some("hello"), None, Some("word")],
+            ),
+            // high length -> identity
+            (
+                vec![Some("hello"), None, Some("word")],
+                0,
+                Some(1000),
+                vec![Some("hello"), None, Some("word")],
+            ),
+        ];
+
+        cases
+            .into_iter()
+            .map(|(array, start, length, expected)| {
+                let array = T::from(array);
+                let result = substring(&array, start, &length)?;
+                assert_eq!(array.len(), result.len());
+
+                let result = result.as_any().downcast_ref::<T>().unwrap();
+                let expected = T::from(expected);
+                assert_eq!(&expected, result);
+                Ok(())
+            })
+            .collect::<Result<()>>()?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn with_nulls_string() -> Result<()> {
+        with_nulls::<StringArray>()
+    }
+
+    #[test]
+    fn with_nulls_large_string() -> Result<()> {
+        with_nulls::<LargeStringArray>()
+    }
+
+    fn without_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
+    ) -> Result<()> {
+        let cases = vec![
+            // increase start
+            (
+                vec!["hello", "", "word"],
+                0,
+                None,
+                vec!["hello", "", "word"],
+            ),
+            (vec!["hello", "", "word"], 1, None, vec!["ello", "", "ord"]),
+            (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]),
+            (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]),
+            (vec!["hello", "", "word"], 10, None, vec!["", "", ""]),
+            // increase start negatively
+            (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]),
+            (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]),
+            (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]),
+            (
+                vec!["hello", "", "word"],
+                -10,
+                None,
+                vec!["hello", "", "word"],
+            ),
+            // increase length
+            (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]),
+            (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]),
+            (
+                vec!["hello", "", "word"],
+                1,
+                Some(3),
+                vec!["ell", "", "ord"],
+            ),
+            (
+                vec!["hello", "", "word"],
+                1,
+                Some(4),
+                vec!["ello", "", "ord"],
+            ),
+            (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]),
+            (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]),
+            (
+                vec!["hello", "", "word"],
+                -3,
+                Some(3),
+                vec!["llo", "", "ord"],
+            ),
+            (
+                vec!["hello", "", "word"],
+                -3,
+                Some(4),
+                vec!["llo", "", "ord"],
+            ),
+        ];
+
+        cases
+            .into_iter()
+            .map(|(array, start, length, expected)| {
+                let array = StringArray::from(array);
+                let result = substring(&array, start, &length)?;
+                assert_eq!(array.len(), result.len());
+                let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+                let expected = StringArray::from(expected);
+                assert_eq!(&expected, result,);
+                Ok(())
+            })
+            .collect::<Result<()>>()?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn without_nulls_string() -> Result<()> {
+        without_nulls::<StringArray>()
+    }
+
+    #[test]
+    fn without_nulls_large_string() -> Result<()> {
+        without_nulls::<LargeStringArray>()
+    }
+}
diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs
index 7e1ad3f..f7a1776 100644
--- a/rust/arrow/src/compute/kernels/take.rs
+++ b/rust/arrow/src/compute/kernels/take.rs
@@ -221,10 +221,8 @@ fn take_boolean(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
         let index = indices.value(i) as usize;
         if array.is_null(index) {
             bit_util::unset_bit(null_slice, i);
-        } else {
-            if array.value(index) {
-                bit_util::set_bit(val_slice, i);
-            }
+        } else if array.value(index) {
+            bit_util::set_bit(val_slice, i);
         }
     });