You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/09/27 21:29:40 UTC
[arrow] branch master updated: ARROW-10019: [Rust] Add substring
kernel
This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c4b0d0e ARROW-10019: [Rust] Add substring kernel
c4b0d0e is described below
commit c4b0d0e6cdb49f508f3aeca90a47bf8d0c8a4ffd
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Sun Sep 27 23:28:47 2020 +0200
ARROW-10019: [Rust] Add substring kernel
This PR adds a new function to kernels called `substring` that yields a substring of a StringArray starting at a given index, and with a given optional length.
`fn substring(array: &Array, start: i32, length: &Option<u32>) -> Result<ArrayRef>`
This operation is common in strings, and it is useful for string-based transformations.
The API is inspired by Spark's API.
Closes #8199 from jorgecarleitao/substring
Authored-by: Jorge C. Leitao <jo...@gmail.com>
Signed-off-by: Neville Dipale <ne...@gmail.com>
---
rust/arrow/README.md | 6 +
rust/arrow/src/array/array.rs | 4 +-
rust/arrow/src/compute/kernels/mod.rs | 1 +
rust/arrow/src/compute/kernels/substring.rs | 274 ++++++++++++++++++++++++++++
rust/arrow/src/compute/kernels/take.rs | 6 +-
5 files changed, 285 insertions(+), 6 deletions(-)
diff --git a/rust/arrow/README.md b/rust/arrow/README.md
index fc90fe9..cb8d285 100644
--- a/rust/arrow/README.md
+++ b/rust/arrow/README.md
@@ -89,6 +89,12 @@ The current status is:
- [X] Arrow IPC
- [ ] Interop tests with other implementations
+- operations
+ - string
+ - [x] length
+ - [x] substring
+ - [x] take
+
## Examples
The examples folder shows how to construct some different types of Arrow
diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index e14b34f..bfbfe39 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -893,7 +893,7 @@ pub trait ListArrayOps<OffsetSize: OffsetSizeTrait> {
}
/// trait declaring an offset size, relevant for i32 vs i64 array types.
-pub trait OffsetSizeTrait: ArrowNativeType + Num {
+pub trait OffsetSizeTrait: ArrowNativeType + Num + Ord {
fn prefix() -> &'static str;
fn to_isize(&self) -> isize;
@@ -1503,7 +1503,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
values.extend_from_slice(s.as_bytes());
} else {
offsets.push(length_so_far);
- values.extend_from_slice("".as_bytes());
+ values.extend_from_slice(b"");
}
}
let array_data = ArrayData::builder(data_type)
diff --git a/rust/arrow/src/compute/kernels/mod.rs b/rust/arrow/src/compute/kernels/mod.rs
index 0cd3d70..5ac0f0b 100644
--- a/rust/arrow/src/compute/kernels/mod.rs
+++ b/rust/arrow/src/compute/kernels/mod.rs
@@ -27,5 +27,6 @@ pub mod filter;
pub mod length;
pub mod limit;
pub mod sort;
+pub mod substring;
pub mod take;
pub mod temporal;
diff --git a/rust/arrow/src/compute/kernels/substring.rs b/rust/arrow/src/compute/kernels/substring.rs
new file mode 100644
index 0000000..04be80b
--- /dev/null
+++ b/rust/arrow/src/compute/kernels/substring.rs
@@ -0,0 +1,274 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines kernel to extract a substring of a [Large]StringArray
+
+use crate::{array::*, buffer::Buffer, datatypes::ToByteSlice};
+use crate::{
+ datatypes::DataType,
+ error::{ArrowError, Result},
+};
+use std::sync::Arc;
+
+fn substring1<OffsetSize: OffsetSizeTrait>(
+ array: &GenericStringArray<OffsetSize>,
+ start: OffsetSize,
+ length: &Option<OffsetSize>,
+ datatype: DataType,
+) -> Result<ArrayRef> {
+ // compute current offsets
+ let offsets = array.data_ref().clone().buffers()[0].clone();
+ let offsets: &[OffsetSize] = unsafe { offsets.typed_data::<OffsetSize>() };
+
+ // compute null bitmap (copy)
+ let null_bit_buffer = array.data_ref().null_buffer().cloned();
+
+ // compute values
+ let values = &array.data_ref().buffers()[1];
+ let data = values.data();
+
+ let mut new_values = Vec::new(); // we have no way to estimate how much this will be.
+ let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(array.len() + 1);
+
+ let mut length_so_far = OffsetSize::zero();
+ new_offsets.push(length_so_far);
+ (0..array.len()).for_each(|i| {
+ // the length of this entry
+ let lenght_i: OffsetSize = offsets[i + 1] - offsets[i];
+ // compute where we should start slicing this entry
+ let start = offsets[i]
+ + if start >= OffsetSize::zero() {
+ start
+ } else {
+ lenght_i + start
+ };
+
+ let start = start.max(offsets[i]).min(offsets[i + 1]);
+ // compute the lenght of the slice
+ let length: OffsetSize = length
+ .unwrap_or(lenght_i)
+ // .max(0) is not needed as it is guaranteed
+ .min(offsets[i + 1] - start); // so we do not go beyond this entry
+
+ length_so_far = length_so_far + length;
+
+ new_offsets.push(length_so_far);
+
+ // we need usize for ranges
+ let start = start.to_usize().unwrap();
+ let length = length.to_usize().unwrap();
+
+ new_values.extend_from_slice(&data[start..start + length]);
+ });
+
+ let data = ArrayData::new(
+ datatype,
+ array.len(),
+ None,
+ null_bit_buffer,
+ 0,
+ vec![
+ Buffer::from(new_offsets.to_byte_slice()),
+ Buffer::from(&new_values[..]),
+ ],
+ vec![],
+ );
+ Ok(make_array(Arc::new(data)))
+}
+
+/// Returns an ArrayRef with a substring starting from `start` and with optional length `length` of each of the elements in `array`.
+/// `start` can be negative, in which case the start counts from the end of the string.
+/// this function errors when the passed array is not a [Large]String array.
+pub fn substring(array: &Array, start: i64, length: &Option<u64>) -> Result<ArrayRef> {
+ match array.data_type() {
+ DataType::LargeUtf8 => substring1(
+ array
+ .as_any()
+ .downcast_ref::<LargeStringArray>()
+ .expect("A large string is expected"),
+ start,
+ &length.map(|e| e as i64),
+ DataType::LargeUtf8,
+ ),
+ DataType::Utf8 => substring1(
+ array
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .expect("A string is expected"),
+ start as i32,
+ &length.map(|e| e as i32),
+ DataType::Utf8,
+ ),
+ _ => Err(ArrowError::ComputeError(format!(
+ "substring does not support type {:?}",
+ array.data_type()
+ ))),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ fn with_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
+ ) -> Result<()> {
+ let cases = vec![
+ // identity
+ (
+ vec![Some("hello"), None, Some("word")],
+ 0,
+ None,
+ vec![Some("hello"), None, Some("word")],
+ ),
+ // 0 length -> Nothing
+ (
+ vec![Some("hello"), None, Some("word")],
+ 0,
+ Some(0),
+ vec![Some(""), None, Some("")],
+ ),
+ // high start -> Nothing
+ (
+ vec![Some("hello"), None, Some("word")],
+ 1000,
+ Some(0),
+ vec![Some(""), None, Some("")],
+ ),
+ // high negative start -> identity
+ (
+ vec![Some("hello"), None, Some("word")],
+ -1000,
+ None,
+ vec![Some("hello"), None, Some("word")],
+ ),
+ // high length -> identity
+ (
+ vec![Some("hello"), None, Some("word")],
+ 0,
+ Some(1000),
+ vec![Some("hello"), None, Some("word")],
+ ),
+ ];
+
+ cases
+ .into_iter()
+ .map(|(array, start, length, expected)| {
+ let array = T::from(array);
+ let result = substring(&array, start, &length)?;
+ assert_eq!(array.len(), result.len());
+
+ let result = result.as_any().downcast_ref::<T>().unwrap();
+ let expected = T::from(expected);
+ assert_eq!(&expected, result);
+ Ok(())
+ })
+ .collect::<Result<()>>()?;
+
+ Ok(())
+ }
+
+ #[test]
+ fn with_nulls_string() -> Result<()> {
+ with_nulls::<StringArray>()
+ }
+
+ #[test]
+ fn with_nulls_large_string() -> Result<()> {
+ with_nulls::<LargeStringArray>()
+ }
+
+ fn without_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
+ ) -> Result<()> {
+ let cases = vec![
+ // increase start
+ (
+ vec!["hello", "", "word"],
+ 0,
+ None,
+ vec!["hello", "", "word"],
+ ),
+ (vec!["hello", "", "word"], 1, None, vec!["ello", "", "ord"]),
+ (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]),
+ (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]),
+ (vec!["hello", "", "word"], 10, None, vec!["", "", ""]),
+ // increase start negatively
+ (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]),
+ (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]),
+ (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]),
+ (
+ vec!["hello", "", "word"],
+ -10,
+ None,
+ vec!["hello", "", "word"],
+ ),
+ // increase length
+ (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]),
+ (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]),
+ (
+ vec!["hello", "", "word"],
+ 1,
+ Some(3),
+ vec!["ell", "", "ord"],
+ ),
+ (
+ vec!["hello", "", "word"],
+ 1,
+ Some(4),
+ vec!["ello", "", "ord"],
+ ),
+ (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]),
+ (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]),
+ (
+ vec!["hello", "", "word"],
+ -3,
+ Some(3),
+ vec!["llo", "", "ord"],
+ ),
+ (
+ vec!["hello", "", "word"],
+ -3,
+ Some(4),
+ vec!["llo", "", "ord"],
+ ),
+ ];
+
+ cases
+ .into_iter()
+ .map(|(array, start, length, expected)| {
+ let array = StringArray::from(array);
+ let result = substring(&array, start, &length)?;
+ assert_eq!(array.len(), result.len());
+ let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+ let expected = StringArray::from(expected);
+ assert_eq!(&expected, result,);
+ Ok(())
+ })
+ .collect::<Result<()>>()?;
+
+ Ok(())
+ }
+
+ #[test]
+ fn without_nulls_string() -> Result<()> {
+ without_nulls::<StringArray>()
+ }
+
+ #[test]
+ fn without_nulls_large_string() -> Result<()> {
+ without_nulls::<LargeStringArray>()
+ }
+}
diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs
index 7e1ad3f..f7a1776 100644
--- a/rust/arrow/src/compute/kernels/take.rs
+++ b/rust/arrow/src/compute/kernels/take.rs
@@ -221,10 +221,8 @@ fn take_boolean(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
let index = indices.value(i) as usize;
if array.is_null(index) {
bit_util::unset_bit(null_slice, i);
- } else {
- if array.value(index) {
- bit_util::set_bit(val_slice, i);
- }
+ } else if array.value(index) {
+ bit_util::set_bit(val_slice, i);
}
});