You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/29 11:31:19 UTC

[arrow-rs] branch master updated: Support `FixedSizeBinary` in Row format (#3182)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new ab3f38448 Support `FixedSizeBinary` in Row format (#3182)
ab3f38448 is described below

commit ab3f384483c4fef645f9d1653f1adda3470594b2
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Nov 29 11:31:13 2022 +0000

    Support `FixedSizeBinary` in Row format (#3182)
    
    * Add support for FixedSizeBinary in Row format
    
    * Add docs
---
 arrow/src/row/fixed.rs | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
 arrow/src/row/mod.rs   | 47 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs
index 9aef83ce2..03c53c994 100644
--- a/arrow/src/row/fixed.rs
+++ b/arrow/src/row/fixed.rs
@@ -20,7 +20,7 @@ use crate::compute::SortOptions;
 use crate::datatypes::ArrowPrimitiveType;
 use crate::row::{null_sentinel, Rows};
 use arrow_array::builder::BufferBuilder;
-use arrow_array::BooleanArray;
+use arrow_array::{BooleanArray, FixedSizeBinaryArray};
 use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
 use arrow_schema::DataType;
@@ -201,6 +201,29 @@ pub fn encode<T: FixedLengthEncoding, I: IntoIterator<Item = Option<T>>>(
     }
 }
 
+pub fn encode_fixed_size_binary(
+    out: &mut Rows,
+    array: &FixedSizeBinaryArray,
+    opts: SortOptions,
+) {
+    let len = array.value_length() as usize;
+    for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(array.iter()) {
+        let end_offset = *offset + len + 1;
+        if let Some(val) = maybe_val {
+            let to_write = &mut out.buffer[*offset..end_offset];
+            to_write[0] = 1;
+            to_write[1..].copy_from_slice(&val[..len]);
+            if opts.descending {
+                // Flip bits to reverse order
+                to_write[1..1 + len].iter_mut().for_each(|v| *v = !*v)
+            }
+        } else {
+            out.buffer[*offset] = null_sentinel(opts);
+        }
+        *offset = end_offset;
+    }
+}
+
 /// Splits `len` bytes from `src`
 #[inline]
 fn split_off<'a>(src: &mut &'a [u8], len: usize) -> &'a [u8] {
@@ -330,3 +353,37 @@ where
     // Validated data type above
     unsafe { decode_fixed::<T::Native>(rows, data_type, options).into() }
 }
+
+/// Decodes a `FixedSizeBinaryArray` from rows
+pub fn decode_fixed_size_binary(
+    rows: &mut [&[u8]],
+    size: i32,
+    options: SortOptions,
+) -> FixedSizeBinaryArray {
+    let len = rows.len();
+
+    let mut values = MutableBuffer::new(size as usize * rows.len());
+    let (null_count, nulls) = decode_nulls(rows);
+
+    let encoded_len = size as usize + 1;
+
+    for row in rows {
+        let i = split_off(row, encoded_len);
+        values.extend_from_slice(&i[1..]);
+    }
+
+    if options.descending {
+        for v in values.as_slice_mut() {
+            *v = !*v;
+        }
+    }
+
+    let builder = ArrayDataBuilder::new(DataType::FixedSizeBinary(size))
+        .len(len)
+        .null_count(null_count)
+        .add_buffer(values.into())
+        .null_bit_buffer(Some(nulls));
+
+    // SAFETY: Buffers are the correct length
+    unsafe { builder.build_unchecked().into() }
+}
diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs
index 8572bf892..cff49740f 100644
--- a/arrow/src/row/mod.rs
+++ b/arrow/src/row/mod.rs
@@ -139,7 +139,7 @@ use crate::error::{ArrowError, Result};
 use crate::row::dictionary::{
     compute_dictionary_mapping, decode_dictionary, encode_dictionary,
 };
-use crate::row::fixed::{decode_bool, decode_primitive};
+use crate::row::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
 use crate::row::interner::OrderPreservingInterner;
 use crate::row::variable::{decode_binary, decode_string};
 use crate::{downcast_dictionary_array, downcast_primitive_array};
@@ -213,6 +213,16 @@ mod variable;
 ///
 /// They are then encoded in the same manner as a signed integer.
 ///
+/// ## Fixed Length Bytes Encoding
+///
+/// Fixed length bytes are encoded in the same fashion as primitive types above.
+///
+/// For a fixed size binary array whose values are each `n` bytes long:
+///
+/// A null is encoded as a `0_u8` null sentinel followed by `n` `0_u8` bytes
+///
+/// A valid value is encoded as `1_u8` followed by the value bytes
+///
 /// ## Variable Length Bytes (including Strings) Encoding
 ///
 /// A null is encoded as a `0_u8`.
@@ -936,6 +946,10 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) ->
                         .for_each(|(slice, length)| {
                             *length += variable::encoded_len(slice.map(|x| x.as_bytes()))
                         }),
+                    DataType::FixedSizeBinary(len) => {
+                        let len = len.to_usize().unwrap();
+                        lengths.iter_mut().for_each(|x| *x += 1 + len)
+                    }
                     _ => unreachable!(),
                 }
             }
@@ -1028,6 +1042,10 @@ fn encode_column(
                         .map(|x| x.map(|x| x.as_bytes())),
                     opts,
                 ),
+                DataType::FixedSizeBinary(_) => {
+                    let array = column.as_any().downcast_ref().unwrap();
+                    fixed::encode_fixed_size_binary(out, array, opts)
+                }
                 _ => unreachable!(),
             }
         }
@@ -1092,6 +1110,7 @@ unsafe fn decode_column(
                 DataType::Boolean => Arc::new(decode_bool(rows, options)),
                 DataType::Binary => Arc::new(decode_binary::<i32>(rows, options)),
                 DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows, options)),
+                DataType::FixedSizeBinary(size) => Arc::new(decode_fixed_size_binary(rows, size, options)),
                 DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options, validate_utf8)),
                 DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows, options, validate_utf8)),
                 _ => unreachable!()
@@ -1154,6 +1173,7 @@ unsafe fn decode_column(
 mod tests {
     use std::sync::Arc;
 
+    use arrow_array::builder::FixedSizeBinaryBuilder;
     use rand::distributions::uniform::SampleUniform;
     use rand::distributions::{Distribution, Standard};
     use rand::{thread_rng, Rng};
@@ -1713,9 +1733,31 @@ mod tests {
         DictionaryArray::from(data)
     }
 
+    fn generate_fixed_size_binary(
+        len: usize,
+        valid_percent: f64,
+    ) -> FixedSizeBinaryArray {
+        let mut rng = thread_rng();
+        let width = rng.gen_range(0..20);
+        let mut builder = FixedSizeBinaryBuilder::new(width);
+
+        let mut b = vec![0; width as usize];
+        for _ in 0..len {
+            match rng.gen_bool(valid_percent) {
+                true => {
+                    b.iter_mut().for_each(|x| *x = rng.gen());
+                    builder.append_value(&b).unwrap();
+                }
+                false => builder.append_null(),
+            }
+        }
+
+        builder.finish()
+    }
+
     fn generate_column(len: usize) -> ArrayRef {
         let mut rng = thread_rng();
-        match rng.gen_range(0..9) {
+        match rng.gen_range(0..10) {
             0 => Arc::new(generate_primitive_array::<Int32Type>(len, 0.8)),
             1 => Arc::new(generate_primitive_array::<UInt32Type>(len, 0.8)),
             2 => Arc::new(generate_primitive_array::<Int64Type>(len, 0.8)),
@@ -1738,6 +1780,7 @@ mod tests {
                 len,
                 0.8,
             )),
+            9 => Arc::new(generate_fixed_size_binary(len, 0.8)),
             _ => unreachable!(),
         }
     }