You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/29 11:31:19 UTC
[arrow-rs] branch master updated: Support `FixedSizeBinary` in Row format (#3182)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new ab3f38448 Support `FixedSizeBinary` in Row format (#3182)
ab3f38448 is described below
commit ab3f384483c4fef645f9d1653f1adda3470594b2
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Nov 29 11:31:13 2022 +0000
Support `FixedSizeBinary` in Row format (#3182)
* Add support for FixedSizeBinary in Row format
* Add docs
---
arrow/src/row/fixed.rs | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
arrow/src/row/mod.rs | 47 ++++++++++++++++++++++++++++++++++++++--
2 files changed, 103 insertions(+), 3 deletions(-)
diff --git a/arrow/src/row/fixed.rs b/arrow/src/row/fixed.rs
index 9aef83ce2..03c53c994 100644
--- a/arrow/src/row/fixed.rs
+++ b/arrow/src/row/fixed.rs
@@ -20,7 +20,7 @@ use crate::compute::SortOptions;
use crate::datatypes::ArrowPrimitiveType;
use crate::row::{null_sentinel, Rows};
use arrow_array::builder::BufferBuilder;
-use arrow_array::BooleanArray;
+use arrow_array::{BooleanArray, FixedSizeBinaryArray};
use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
@@ -201,6 +201,29 @@ pub fn encode<T: FixedLengthEncoding, I: IntoIterator<Item = Option<T>>>(
}
}
+pub fn encode_fixed_size_binary(
+ out: &mut Rows,
+ array: &FixedSizeBinaryArray,
+ opts: SortOptions,
+) {
+ let len = array.value_length() as usize; // width in bytes of every value in `array`
+ for (offset, maybe_val) in out.offsets.iter_mut().skip(1).zip(array.iter()) {
+ let end_offset = *offset + len + 1; // one leading marker byte plus `len` value bytes
+ if let Some(val) = maybe_val {
+ let to_write = &mut out.buffer[*offset..end_offset];
+ to_write[0] = 1; // leading byte of 1 marks a valid (non-null) value
+ to_write[1..].copy_from_slice(&val[..len]);
+ if opts.descending {
+ // Flip only the value bytes (not the leading marker byte) to reverse their order
+ to_write[1..1 + len].iter_mut().for_each(|v| *v = !*v)
+ }
+ } else {
+ out.buffer[*offset] = null_sentinel(opts); // for a null only the leading byte is written
+ }
+ *offset = end_offset; // the slot now points just past the bytes written here
+ }
+}
+
/// Splits `len` bytes from `src`
#[inline]
fn split_off<'a>(src: &mut &'a [u8], len: usize) -> &'a [u8] {
@@ -330,3 +353,37 @@ where
// Validated data type above
unsafe { decode_fixed::<T::Native>(rows, data_type, options).into() }
}
+
+/// Decodes a `FixedSizeBinaryArray` with `size`-byte values from `rows`
+pub fn decode_fixed_size_binary(
+ rows: &mut [&[u8]],
+ size: i32,
+ options: SortOptions,
+) -> FixedSizeBinaryArray {
+ let len = rows.len();
+
+ let mut values = MutableBuffer::new(size as usize * rows.len());
+ let (null_count, nulls) = decode_nulls(rows);
+
+ let encoded_len = size as usize + 1; // `size` value bytes plus one leading byte
+
+ for row in rows {
+ let i = split_off(row, encoded_len);
+ values.extend_from_slice(&i[1..]); // skip the leading byte, keep the value bytes
+ }
+
+ if options.descending {
+ for v in values.as_slice_mut() {
+ *v = !*v; // undo the bit flip applied when encoding in descending order
+ }
+ }
+
+ let builder = ArrayDataBuilder::new(DataType::FixedSizeBinary(size))
+ .len(len)
+ .null_count(null_count)
+ .add_buffer(values.into())
+ .null_bit_buffer(Some(nulls));
+
+ // SAFETY: `values` was extended with `size` bytes for each of the `len` rows
+ unsafe { builder.build_unchecked().into() }
+}
diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs
index 8572bf892..cff49740f 100644
--- a/arrow/src/row/mod.rs
+++ b/arrow/src/row/mod.rs
@@ -139,7 +139,7 @@ use crate::error::{ArrowError, Result};
use crate::row::dictionary::{
compute_dictionary_mapping, decode_dictionary, encode_dictionary,
};
-use crate::row::fixed::{decode_bool, decode_primitive};
+use crate::row::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
use crate::row::interner::OrderPreservingInterner;
use crate::row::variable::{decode_binary, decode_string};
use crate::{downcast_dictionary_array, downcast_primitive_array};
@@ -213,6 +213,16 @@ mod variable;
///
/// They are then encoded in the same manner as a signed integer.
///
+/// ## Fixed Length Bytes Encoding
+///
+/// Fixed length bytes are encoded in the same fashion as primitive types above.
+///
+/// For a fixed length bytes array whose values are each `n` bytes long:
+///
+/// A null is encoded as a `0_u8` null sentinel followed by `n` `0_u8` bytes
+///
+/// A valid value is encoded as `1_u8` followed by the value bytes
+///
/// ## Variable Length Bytes (including Strings) Encoding
///
/// A null is encoded as a `0_u8`.
@@ -936,6 +946,10 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) ->
.for_each(|(slice, length)| {
*length += variable::encoded_len(slice.map(|x| x.as_bytes()))
}),
+ DataType::FixedSizeBinary(len) => {
+ let len = len.to_usize().unwrap();
+ lengths.iter_mut().for_each(|x| *x += 1 + len)
+ }
_ => unreachable!(),
}
}
@@ -1028,6 +1042,10 @@ fn encode_column(
.map(|x| x.map(|x| x.as_bytes())),
opts,
),
+ DataType::FixedSizeBinary(_) => {
+ let array = column.as_any().downcast_ref().unwrap();
+ fixed::encode_fixed_size_binary(out, array, opts)
+ }
_ => unreachable!(),
}
}
@@ -1092,6 +1110,7 @@ unsafe fn decode_column(
DataType::Boolean => Arc::new(decode_bool(rows, options)),
DataType::Binary => Arc::new(decode_binary::<i32>(rows, options)),
DataType::LargeBinary => Arc::new(decode_binary::<i64>(rows, options)),
+ DataType::FixedSizeBinary(size) => Arc::new(decode_fixed_size_binary(rows, size, options)),
DataType::Utf8 => Arc::new(decode_string::<i32>(rows, options, validate_utf8)),
DataType::LargeUtf8 => Arc::new(decode_string::<i64>(rows, options, validate_utf8)),
_ => unreachable!()
@@ -1154,6 +1173,7 @@ unsafe fn decode_column(
mod tests {
use std::sync::Arc;
+ use arrow_array::builder::FixedSizeBinaryBuilder;
use rand::distributions::uniform::SampleUniform;
use rand::distributions::{Distribution, Standard};
use rand::{thread_rng, Rng};
@@ -1713,9 +1733,31 @@ mod tests {
DictionaryArray::from(data)
}
+ fn generate_fixed_size_binary(
+ len: usize,
+ valid_percent: f64,
+ ) -> FixedSizeBinaryArray {
+ let mut rng = thread_rng();
+ let width = rng.gen_range(0..20); // random value width for this array; may be zero
+ let mut builder = FixedSizeBinaryBuilder::new(width);
+
+ let mut b = vec![0; width as usize]; // scratch buffer reused for every generated value
+ for _ in 0..len {
+ match rng.gen_bool(valid_percent) { // `valid_percent` is passed to `gen_bool` as a probability
+ true => {
+ b.iter_mut().for_each(|x| *x = rng.gen()); // refill with fresh random bytes
+ builder.append_value(&b).unwrap();
+ }
+ false => builder.append_null(),
+ }
+ }
+
+ builder.finish()
+ }
+
fn generate_column(len: usize) -> ArrayRef {
let mut rng = thread_rng();
- match rng.gen_range(0..9) {
+ match rng.gen_range(0..10) {
0 => Arc::new(generate_primitive_array::<Int32Type>(len, 0.8)),
1 => Arc::new(generate_primitive_array::<UInt32Type>(len, 0.8)),
2 => Arc::new(generate_primitive_array::<Int64Type>(len, 0.8)),
@@ -1738,6 +1780,7 @@ mod tests {
len,
0.8,
)),
+ 9 => Arc::new(generate_fixed_size_binary(len, 0.8)),
_ => unreachable!(),
}
}