You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2022/05/05 01:30:43 UTC
[arrow-rs] branch master updated: Add `substring` support for `FixedSizeBinaryArray` (#1633)
This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e51de5eb7 Add `substring` support for `FixedSizeBinaryArray` (#1633)
e51de5eb7 is described below
commit e51de5eb7ef40a4a35324e28d120df5495b68c6c
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Thu May 5 09:30:37 2022 +0800
Add `substring` support for `FixedSizeBinaryArray` (#1633)
* add function, no tests yet
Signed-off-by: remzi <13...@gmail.com>
* adjust the fn structure
Signed-off-by: remzi <13...@gmail.com>
* cargo fmt
Signed-off-by: remzi <13...@gmail.com>
* add tests
Signed-off-by: remzi <13...@gmail.com>
* fix clippy
Signed-off-by: remzi <13...@gmail.com>
* add identical test cases for utf8 and binary
Signed-off-by: remzi <13...@gmail.com>
* add benchmark
Signed-off-by: remzi <13...@gmail.com>
* fix offset bug
Signed-off-by: remzi <13...@gmail.com>
* fix a nit
Signed-off-by: remzi <13...@gmail.com>
---
arrow/benches/string_kernels.rs | 17 +-
arrow/src/compute/kernels/substring.rs | 351 +++++++++++++++++++++++++++++++++
2 files changed, 362 insertions(+), 6 deletions(-)
diff --git a/arrow/benches/string_kernels.rs b/arrow/benches/string_kernels.rs
index 37d1f3f89..7df52a6bb 100644
--- a/arrow/benches/string_kernels.rs
+++ b/arrow/benches/string_kernels.rs
@@ -25,22 +25,27 @@ use arrow::array::*;
use arrow::compute::kernels::substring::substring;
use arrow::util::bench_util::*;
-fn bench_substring(arr: &StringArray, start: i64, length: Option<u64>) {
+fn bench_substring(arr: &dyn Array, start: i64, length: Option<u64>) {
substring(criterion::black_box(arr), start, length).unwrap();
}
fn add_benchmark(c: &mut Criterion) {
let size = 65536;
- let str_len = 1000;
+ let val_len = 1000;
- let arr_string = create_string_array_with_len::<i32>(size, 0.0, str_len);
+ let arr_string = create_string_array_with_len::<i32>(size, 0.0, val_len);
+ let arr_fsb = create_fsb_array(size, 0.0, val_len);
- c.bench_function("substring (start = 0, length = None)", |b| {
+ c.bench_function("substring utf8 (start = 0, length = None)", |b| {
b.iter(|| bench_substring(&arr_string, 0, None))
});
- c.bench_function("substring (start = 1, length = str_len - 1)", |b| {
- b.iter(|| bench_substring(&arr_string, 1, Some((str_len - 1) as u64)))
+ c.bench_function("substring utf8 (start = 1, length = str_len - 1)", |b| {
+ b.iter(|| bench_substring(&arr_string, 1, Some((val_len - 1) as u64)))
+ });
+
+ c.bench_function("substring fixed size binary array", |b| {
+ b.iter(|| bench_substring(&arr_fsb, 1, Some((val_len - 1) as u64)))
});
}
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index b687757c4..9d898fda0 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -86,6 +86,55 @@ fn binary_substring<OffsetSize: BinaryOffsetSizeTrait>(
Ok(make_array(data))
}
+fn fixed_size_binary_substring(
+ array: &FixedSizeBinaryArray,
+ old_len: i32,
+ start: i32,
+ length: Option<i32>,
+) -> Result<ArrayRef> {
+ let new_start = if start >= 0 {
+ start.min(old_len)
+ } else {
+ (old_len + start).max(0)
+ };
+ let new_len = match length {
+ Some(len) => len.min(old_len - new_start),
+ None => old_len - new_start,
+ };
+
+ // build value buffer
+ let num_of_elements = array.len();
+ let values = array.value_data();
+ let data = values.as_slice();
+ let mut new_values = MutableBuffer::new(num_of_elements * (new_len as usize));
+ (0..num_of_elements)
+ .map(|idx| {
+ let offset = array.value_offset(idx);
+ (
+ (offset + new_start) as usize,
+ (offset + new_start + new_len) as usize,
+ )
+ })
+ .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
+
+ let array_data = unsafe {
+ ArrayData::new_unchecked(
+ DataType::FixedSizeBinary(new_len),
+ num_of_elements,
+ None,
+ array
+ .data_ref()
+ .null_buffer()
+ .map(|b| b.bit_slice(array.offset(), num_of_elements)),
+ 0,
+ vec![new_values.into()],
+ vec![],
+ )
+ };
+
+ Ok(make_array(array_data))
+}
+
/// substring by byte
fn utf8_substring<OffsetSize: StringOffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
@@ -219,6 +268,15 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
start as i32,
length.map(|e| e as i32),
),
+ DataType::FixedSizeBinary(old_len) => fixed_size_binary_substring(
+ array
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .expect("a fixed size binary is expected"),
+ *old_len,
+ start as i32,
+ length.map(|e| e as i32),
+ ),
DataType::LargeUtf8 => utf8_substring(
array
.as_any()
@@ -249,6 +307,8 @@ mod tests {
#[allow(clippy::type_complexity)]
fn with_nulls_generic_binary<O: BinaryOffsetSizeTrait>() -> Result<()> {
let cases: Vec<(Vec<Option<&[u8]>>, i64, Option<u64>, Vec<Option<&[u8]>>)> = vec![
+ // all-nulls array is always identical
+ (vec![None, None, None], -1, Some(1), vec![None, None, None]),
// identity
(
vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
@@ -318,6 +378,8 @@ mod tests {
#[allow(clippy::type_complexity)]
fn without_nulls_generic_binary<O: BinaryOffsetSizeTrait>() -> Result<()> {
let cases: Vec<(Vec<&[u8]>, i64, Option<u64>, Vec<&[u8]>)> = vec![
+ // empty array is always identical
+ (vec![b"", b"", b""], 2, Some(1), vec![b"", b"", b""]),
// increase start
(
vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
@@ -453,8 +515,295 @@ mod tests {
without_nulls_generic_binary::<i64>()
}
+ #[test]
+ #[allow(clippy::type_complexity)]
+ fn with_nulls_fixed_size_binary() -> Result<()> {
+ let cases: Vec<(Vec<Option<&[u8]>>, i64, Option<u64>, Vec<Option<&[u8]>>)> = vec![
+ // all-nulls array is always identical
+ (vec![None, None, None], 3, Some(2), vec![None, None, None]),
+ // increase start
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 0,
+ None,
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 1,
+ None,
+ vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 2,
+ None,
+ vec![Some(b"t"), None, Some(&[0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 3,
+ None,
+ vec![Some(b""), None, Some(&[])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 10,
+ None,
+ vec![Some(b""), None, Some(b"")],
+ ),
+ // increase start negatively
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -1,
+ None,
+ vec![Some(b"t"), None, Some(&[0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -2,
+ None,
+ vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -3,
+ None,
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -10,
+ None,
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ ),
+ // increase length
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 1,
+ Some(1),
+ vec![Some(b"a"), None, Some(&[0xf9])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 1,
+ Some(2),
+ vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ 1,
+ Some(3),
+ vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -3,
+ Some(1),
+ vec![Some(b"c"), None, Some(&[0xf8])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -3,
+ Some(2),
+ vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -3,
+ Some(3),
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ ),
+ (
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ -3,
+ Some(4),
+ vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+ ),
+ ];
+
+ cases.into_iter().try_for_each::<_, Result<()>>(
+ |(array, start, length, expected)| {
+ let array = FixedSizeBinaryArray::try_from_sparse_iter(array.into_iter())
+ .unwrap();
+ let result = substring(&array, start, length)?;
+ assert_eq!(array.len(), result.len());
+ let result = result
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .unwrap();
+ let expected =
+ FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter())
+ .unwrap();
+ assert_eq!(&expected, result,);
+ Ok(())
+ },
+ )?;
+
+ Ok(())
+ }
+
+ #[test]
+ #[allow(clippy::type_complexity)]
+ fn without_nulls_fixed_size_binary() -> Result<()> {
+ let cases: Vec<(Vec<&[u8]>, i64, Option<u64>, Vec<&[u8]>)> = vec![
+ // empty array is always identical
+ (vec![b"", b"", &[]], 3, Some(2), vec![b"", b"", &[]]),
+ // increase start
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 0,
+ None,
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 1,
+ None,
+ vec![b"at", b"og", &[0xf9, 0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 2,
+ None,
+ vec![b"t", b"g", &[0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 3,
+ None,
+ vec![b"", b"", &[]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 10,
+ None,
+ vec![b"", b"", b""],
+ ),
+ // increase start negatively
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -1,
+ None,
+ vec![b"t", b"g", &[0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -2,
+ None,
+ vec![b"at", b"og", &[0xf9, 0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -3,
+ None,
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -10,
+ None,
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ ),
+ // increase length
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 1,
+ Some(1),
+ vec![b"a", b"o", &[0xf9]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 1,
+ Some(2),
+ vec![b"at", b"og", &[0xf9, 0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ 1,
+ Some(3),
+ vec![b"at", b"og", &[0xf9, 0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -3,
+ Some(1),
+ vec![b"c", b"d", &[0xf8]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -3,
+ Some(2),
+ vec![b"ca", b"do", &[0xf8, 0xf9]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -3,
+ Some(3),
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ ),
+ (
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ -3,
+ Some(4),
+ vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+ ),
+ ];
+
+ cases.into_iter().try_for_each::<_, Result<()>>(
+ |(array, start, length, expected)| {
+ let array =
+ FixedSizeBinaryArray::try_from_iter(array.into_iter()).unwrap();
+ let result = substring(&array, start, length)?;
+ assert_eq!(array.len(), result.len());
+ let result = result
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .unwrap();
+ let expected =
+ FixedSizeBinaryArray::try_from_iter(expected.into_iter()).unwrap();
+ assert_eq!(&expected, result,);
+ Ok(())
+ },
+ )?;
+
+ Ok(())
+ }
+
+ #[test]
+ fn offset_fixed_size_binary() -> Result<()> {
+ let values: [u8; 15] = *b"hellotherearrow";
+ // set the first and third element to be valid
+ let bits_v = [0b101_u8];
+
+ let data = ArrayData::builder(DataType::FixedSizeBinary(5))
+ .len(2)
+ .add_buffer(Buffer::from(&values[..]))
+ .offset(1)
+ .null_bit_buffer(Buffer::from(bits_v))
+ .build()
+ .unwrap();
+ // array is `[null, "arrow"]`
+ let array = FixedSizeBinaryArray::from(data);
+ // result is `[null, "rrow"]`
+ let result = substring(&array, 1, None)?;
+ let result = result
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .unwrap();
+ let expected = FixedSizeBinaryArray::try_from_sparse_iter(
+ vec![None, Some(b"rrow")].into_iter(),
+ )
+ .unwrap();
+ assert_eq!(result, &expected);
+
+ Ok(())
+ }
+
fn with_nulls_generic_string<O: StringOffsetSizeTrait>() -> Result<()> {
let cases = vec![
+ // all-nulls array is always identical
+ (vec![None, None, None], 0, None, vec![None, None, None]),
// identity
(
vec![Some("hello"), None, Some("word")],
@@ -523,6 +872,8 @@ mod tests {
fn without_nulls_generic_string<O: StringOffsetSizeTrait>() -> Result<()> {
let cases = vec![
+ // empty array is always identical
+ (vec!["", "", ""], 0, None, vec!["", "", ""]),
// increase start
(
vec!["hello", "", "word"],