You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2022/05/05 01:30:43 UTC

[arrow-rs] branch master updated: Add `substring` support for `FixedSizeBinaryArray` (#1633)

This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e51de5eb7 Add `substring` support for `FixedSizeBinaryArray` (#1633)
e51de5eb7 is described below

commit e51de5eb7ef40a4a35324e28d120df5495b68c6c
Author: Remzi Yang <59...@users.noreply.github.com>
AuthorDate: Thu May 5 09:30:37 2022 +0800

    Add `substring` support for `FixedSizeBinaryArray` (#1633)
    
    * add function, no tests yet
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * adjust the fn structure
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * cargo fmt
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add tests
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * fix clippy
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add identical test cases for utf8 and binary
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * add benchmark
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * fix offset bug
    
    Signed-off-by: remzi <13...@gmail.com>
    
    * fix a nit
    
    Signed-off-by: remzi <13...@gmail.com>
---
 arrow/benches/string_kernels.rs        |  17 +-
 arrow/src/compute/kernels/substring.rs | 351 +++++++++++++++++++++++++++++++++
 2 files changed, 362 insertions(+), 6 deletions(-)

diff --git a/arrow/benches/string_kernels.rs b/arrow/benches/string_kernels.rs
index 37d1f3f89..7df52a6bb 100644
--- a/arrow/benches/string_kernels.rs
+++ b/arrow/benches/string_kernels.rs
@@ -25,22 +25,27 @@ use arrow::array::*;
 use arrow::compute::kernels::substring::substring;
 use arrow::util::bench_util::*;
 
-fn bench_substring(arr: &StringArray, start: i64, length: Option<u64>) {
+fn bench_substring(arr: &dyn Array, start: i64, length: Option<u64>) {
     substring(criterion::black_box(arr), start, length).unwrap();
 }
 
 fn add_benchmark(c: &mut Criterion) {
     let size = 65536;
-    let str_len = 1000;
+    let val_len = 1000;
 
-    let arr_string = create_string_array_with_len::<i32>(size, 0.0, str_len);
+    let arr_string = create_string_array_with_len::<i32>(size, 0.0, val_len);
+    let arr_fsb = create_fsb_array(size, 0.0, val_len);
 
-    c.bench_function("substring (start = 0, length = None)", |b| {
+    c.bench_function("substring utf8 (start = 0, length = None)", |b| {
         b.iter(|| bench_substring(&arr_string, 0, None))
     });
 
-    c.bench_function("substring (start = 1, length = str_len - 1)", |b| {
-        b.iter(|| bench_substring(&arr_string, 1, Some((str_len - 1) as u64)))
+    c.bench_function("substring utf8 (start = 1, length = str_len - 1)", |b| {
+        b.iter(|| bench_substring(&arr_string, 1, Some((val_len - 1) as u64)))
+    });
+
+    c.bench_function("substring fixed size binary array", |b| {
+        b.iter(|| bench_substring(&arr_fsb, 1, Some((val_len - 1) as u64)))
     });
 }
 
diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs
index b687757c4..9d898fda0 100644
--- a/arrow/src/compute/kernels/substring.rs
+++ b/arrow/src/compute/kernels/substring.rs
@@ -86,6 +86,55 @@ fn binary_substring<OffsetSize: BinaryOffsetSizeTrait>(
     Ok(make_array(data))
 }
 
+fn fixed_size_binary_substring(
+    array: &FixedSizeBinaryArray,
+    old_len: i32,
+    start: i32,
+    length: Option<i32>,
+) -> Result<ArrayRef> {
+    let new_start = if start >= 0 {
+        start.min(old_len)
+    } else {
+        (old_len + start).max(0)
+    };
+    let new_len = match length {
+        Some(len) => len.min(old_len - new_start),
+        None => old_len - new_start,
+    };
+
+    // build value buffer
+    let num_of_elements = array.len();
+    let values = array.value_data();
+    let data = values.as_slice();
+    let mut new_values = MutableBuffer::new(num_of_elements * (new_len as usize));
+    (0..num_of_elements)
+        .map(|idx| {
+            let offset = array.value_offset(idx);
+            (
+                (offset + new_start) as usize,
+                (offset + new_start + new_len) as usize,
+            )
+        })
+        .for_each(|(start, end)| new_values.extend_from_slice(&data[start..end]));
+
+    let array_data = unsafe {
+        ArrayData::new_unchecked(
+            DataType::FixedSizeBinary(new_len),
+            num_of_elements,
+            None,
+            array
+                .data_ref()
+                .null_buffer()
+                .map(|b| b.bit_slice(array.offset(), num_of_elements)),
+            0,
+            vec![new_values.into()],
+            vec![],
+        )
+    };
+
+    Ok(make_array(array_data))
+}
+
 /// substring by byte
 fn utf8_substring<OffsetSize: StringOffsetSizeTrait>(
     array: &GenericStringArray<OffsetSize>,
@@ -219,6 +268,15 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
             start as i32,
             length.map(|e| e as i32),
         ),
+        DataType::FixedSizeBinary(old_len) => fixed_size_binary_substring(
+            array
+                .as_any()
+                .downcast_ref::<FixedSizeBinaryArray>()
+                .expect("a fixed size binary is expected"),
+            *old_len,
+            start as i32,
+            length.map(|e| e as i32),
+        ),
         DataType::LargeUtf8 => utf8_substring(
             array
                 .as_any()
@@ -249,6 +307,8 @@ mod tests {
     #[allow(clippy::type_complexity)]
     fn with_nulls_generic_binary<O: BinaryOffsetSizeTrait>() -> Result<()> {
         let cases: Vec<(Vec<Option<&[u8]>>, i64, Option<u64>, Vec<Option<&[u8]>>)> = vec![
+            // all-nulls array is always identical
+            (vec![None, None, None], -1, Some(1), vec![None, None, None]),
             // identity
             (
                 vec![Some(b"hello"), None, Some(&[0xf8, 0xf9, 0xff, 0xfa])],
@@ -318,6 +378,8 @@ mod tests {
     #[allow(clippy::type_complexity)]
     fn without_nulls_generic_binary<O: BinaryOffsetSizeTrait>() -> Result<()> {
         let cases: Vec<(Vec<&[u8]>, i64, Option<u64>, Vec<&[u8]>)> = vec![
+            // empty array is always identical
+            (vec![b"", b"", b""], 2, Some(1), vec![b"", b"", b""]),
             // increase start
             (
                 vec![b"hello", b"", &[0xf8, 0xf9, 0xff, 0xfa]],
@@ -453,8 +515,295 @@ mod tests {
         without_nulls_generic_binary::<i64>()
     }
 
+    #[test]
+    #[allow(clippy::type_complexity)]
+    fn with_nulls_fixed_size_binary() -> Result<()> {
+        let cases: Vec<(Vec<Option<&[u8]>>, i64, Option<u64>, Vec<Option<&[u8]>>)> = vec![
+            // all-nulls array is always identical
+            (vec![None, None, None], 3, Some(2), vec![None, None, None]),
+            // increase start
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                0,
+                None,
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                1,
+                None,
+                vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                2,
+                None,
+                vec![Some(b"t"), None, Some(&[0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                3,
+                None,
+                vec![Some(b""), None, Some(&[])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                10,
+                None,
+                vec![Some(b""), None, Some(b"")],
+            ),
+            // increase start negatively
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -1,
+                None,
+                vec![Some(b"t"), None, Some(&[0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -2,
+                None,
+                vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -3,
+                None,
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -10,
+                None,
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+            ),
+            // increase length
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                1,
+                Some(1),
+                vec![Some(b"a"), None, Some(&[0xf9])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                1,
+                Some(2),
+                vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                1,
+                Some(3),
+                vec![Some(b"at"), None, Some(&[0xf9, 0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -3,
+                Some(1),
+                vec![Some(b"c"), None, Some(&[0xf8])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -3,
+                Some(2),
+                vec![Some(b"ca"), None, Some(&[0xf8, 0xf9])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -3,
+                Some(3),
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+            ),
+            (
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+                -3,
+                Some(4),
+                vec![Some(b"cat"), None, Some(&[0xf8, 0xf9, 0xff])],
+            ),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(
+            |(array, start, length, expected)| {
+                let array = FixedSizeBinaryArray::try_from_sparse_iter(array.into_iter())
+                    .unwrap();
+                let result = substring(&array, start, length)?;
+                assert_eq!(array.len(), result.len());
+                let result = result
+                    .as_any()
+                    .downcast_ref::<FixedSizeBinaryArray>()
+                    .unwrap();
+                let expected =
+                    FixedSizeBinaryArray::try_from_sparse_iter(expected.into_iter())
+                        .unwrap();
+                assert_eq!(&expected, result,);
+                Ok(())
+            },
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    #[allow(clippy::type_complexity)]
+    fn without_nulls_fixed_size_binary() -> Result<()> {
+        let cases: Vec<(Vec<&[u8]>, i64, Option<u64>, Vec<&[u8]>)> = vec![
+            // empty array is always identical
+            (vec![b"", b"", &[]], 3, Some(2), vec![b"", b"", &[]]),
+            // increase start
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                0,
+                None,
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                1,
+                None,
+                vec![b"at", b"og", &[0xf9, 0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                2,
+                None,
+                vec![b"t", b"g", &[0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                3,
+                None,
+                vec![b"", b"", &[]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                10,
+                None,
+                vec![b"", b"", b""],
+            ),
+            // increase start negatively
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -1,
+                None,
+                vec![b"t", b"g", &[0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -2,
+                None,
+                vec![b"at", b"og", &[0xf9, 0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -3,
+                None,
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -10,
+                None,
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+            ),
+            // increase length
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                1,
+                Some(1),
+                vec![b"a", b"o", &[0xf9]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                1,
+                Some(2),
+                vec![b"at", b"og", &[0xf9, 0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                1,
+                Some(3),
+                vec![b"at", b"og", &[0xf9, 0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -3,
+                Some(1),
+                vec![b"c", b"d", &[0xf8]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -3,
+                Some(2),
+                vec![b"ca", b"do", &[0xf8, 0xf9]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -3,
+                Some(3),
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+            ),
+            (
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+                -3,
+                Some(4),
+                vec![b"cat", b"dog", &[0xf8, 0xf9, 0xff]],
+            ),
+        ];
+
+        cases.into_iter().try_for_each::<_, Result<()>>(
+            |(array, start, length, expected)| {
+                let array =
+                    FixedSizeBinaryArray::try_from_iter(array.into_iter()).unwrap();
+                let result = substring(&array, start, length)?;
+                assert_eq!(array.len(), result.len());
+                let result = result
+                    .as_any()
+                    .downcast_ref::<FixedSizeBinaryArray>()
+                    .unwrap();
+                let expected =
+                    FixedSizeBinaryArray::try_from_iter(expected.into_iter()).unwrap();
+                assert_eq!(&expected, result,);
+                Ok(())
+            },
+        )?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn offset_fixed_size_binary() -> Result<()> {
+        let values: [u8; 15] = *b"hellotherearrow";
+        // set the first and third element to be valid
+        let bits_v = [0b101_u8];
+
+        let data = ArrayData::builder(DataType::FixedSizeBinary(5))
+            .len(2)
+            .add_buffer(Buffer::from(&values[..]))
+            .offset(1)
+            .null_bit_buffer(Buffer::from(bits_v))
+            .build()
+            .unwrap();
+        // array is `[null, "arrow"]`
+        let array = FixedSizeBinaryArray::from(data);
+        // result is `[null, "rrow"]`
+        let result = substring(&array, 1, None)?;
+        let result = result
+            .as_any()
+            .downcast_ref::<FixedSizeBinaryArray>()
+            .unwrap();
+        let expected = FixedSizeBinaryArray::try_from_sparse_iter(
+            vec![None, Some(b"rrow")].into_iter(),
+        )
+        .unwrap();
+        assert_eq!(result, &expected);
+
+        Ok(())
+    }
+
     fn with_nulls_generic_string<O: StringOffsetSizeTrait>() -> Result<()> {
         let cases = vec![
+            // all-nulls array is always identical
+            (vec![None, None, None], 0, None, vec![None, None, None]),
             // identity
             (
                 vec![Some("hello"), None, Some("word")],
@@ -523,6 +872,8 @@ mod tests {
 
     fn without_nulls_generic_string<O: StringOffsetSizeTrait>() -> Result<()> {
         let cases = vec![
+            // empty array is always identical
+            (vec!["", "", ""], 0, None, vec!["", "", ""]),
             // increase start
             (
                 vec!["hello", "", "word"],