You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/01/07 11:09:41 UTC
[arrow-rs] branch master updated: Preserve empty list array elements in take kernel (#3473)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c28d69aa5 Preserve empty list array elements in take kernel (#3473)
c28d69aa5 is described below
commit c28d69aa5be2cce2d065300c9a79c4063589f300
Author: Jon Mease <jo...@gmail.com>
AuthorDate: Sat Jan 7 06:09:35 2023 -0500
Preserve empty list array elements in take kernel (#3473)
* Update test_take_list to fail with empty list
* Fix null_bit_buffer to match intended array
comment says: [[0,null,0], [-1,-2,3], null, [5,null]]
which implies null buffer of: 0b11111011
* Fix null_bit_buffer to match intended array
comment says: [[0,null,0], [-1,-2,3], [null], [5,null]]
which has not null values at the list level, or a null
buffer of 0b11111011
* Compute null buffer in take_value_indices_from_list
this way we can distinguish empty list elements from null elements.
* clippy
---
arrow-select/src/take.rs | 73 ++++++++++++++++++++++++++----------------------
1 file changed, 40 insertions(+), 33 deletions(-)
diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs
index 458fbdb66..9fffa0b5f 100644
--- a/arrow-select/src/take.rs
+++ b/arrow-select/src/take.rs
@@ -691,27 +691,11 @@ where
{
// TODO: Some optimizations can be done here such as if it is
// taking the whole list or a contiguous sublist
- let (list_indices, offsets) =
+ let (list_indices, offsets, null_buf) =
take_value_indices_from_list::<IndexType, OffsetType>(values, indices)?;
let taken = take_impl::<OffsetType>(values.values().as_ref(), &list_indices, None)?;
- // determine null count and null buffer, which are a function of `values` and `indices`
- let mut null_count = 0;
- let num_bytes = bit_util::ceil(indices.len(), 8);
- let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
- {
- let null_slice = null_buf.as_slice_mut();
- offsets[..].windows(2).enumerate().for_each(
- |(i, window): (usize, &[OffsetType::Native])| {
- if window[0] == window[1] {
- // offsets are equal, slot is null
- bit_util::unset_bit(null_slice, i);
- null_count += 1;
- }
- },
- );
- }
- let value_offsets = Buffer::from_slice_ref(&offsets);
+ let value_offsets = Buffer::from_slice_ref(offsets);
// create a new list with taken data and computed null information
let list_data = ArrayDataBuilder::new(values.data_type().clone())
.len(indices.len())
@@ -831,10 +815,18 @@ where
/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns
/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2
/// elements)
+#[allow(clippy::type_complexity)]
fn take_value_indices_from_list<IndexType, OffsetType>(
list: &GenericListArray<OffsetType::Native>,
indices: &PrimitiveArray<IndexType>,
-) -> Result<(PrimitiveArray<OffsetType>, Vec<OffsetType::Native>), ArrowError>
+) -> Result<
+ (
+ PrimitiveArray<OffsetType>,
+ Vec<OffsetType::Native>,
+ MutableBuffer,
+ ),
+ ArrowError,
+>
where
IndexType: ArrowPrimitiveType,
IndexType::Native: ToPrimitive,
@@ -850,6 +842,12 @@ where
let mut current_offset = OffsetType::Native::zero();
// add first offset
new_offsets.push(OffsetType::Native::zero());
+
+ // Initialize null buffer
+ let num_bytes = bit_util::ceil(indices.len(), 8);
+ let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
+ let null_slice = null_buf.as_slice_mut();
+
// compute the value indices, and set offsets accordingly
for i in 0..indices.len() {
if indices.is_valid(i) {
@@ -868,12 +866,20 @@ where
values.push(Some(curr));
curr += num::One::one();
}
+ if !list.is_valid(ix) {
+ bit_util::unset_bit(null_slice, i);
+ }
} else {
+ bit_util::unset_bit(null_slice, i);
new_offsets.push(current_offset);
}
}
- Ok((PrimitiveArray::<OffsetType>::from(values), new_offsets))
+ Ok((
+ PrimitiveArray::<OffsetType>::from(values),
+ new_offsets,
+ null_buf,
+ ))
}
/// Takes/filters a fixed size list array's inner data using the offsets of the list array.
@@ -1519,12 +1525,12 @@ mod tests {
macro_rules! test_take_list {
($offset_type:ty, $list_data_type:ident, $list_array_type:ident) => {{
- // Construct a value array, [[0,0,0], [-1,-2,-1], [2,3]]
+ // Construct a value array, [[0,0,0], [-1,-2,-1], [], [2,3]]
let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 3])
.data()
.clone();
// Construct offsets
- let value_offsets: [$offset_type; 4] = [0, 3, 6, 8];
+ let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8];
let value_offsets = Buffer::from_slice_ref(&value_offsets);
// Construct a list array from the above two
let list_data_type = DataType::$list_data_type(Box::new(Field::new(
@@ -1533,30 +1539,28 @@ mod tests {
false,
)));
let list_data = ArrayData::builder(list_data_type.clone())
- .len(3)
+ .len(4)
.add_buffer(value_offsets)
.add_child_data(value_data)
.build()
.unwrap();
let list_array = $list_array_type::from(list_data);
- // index returns: [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]]
- let index = UInt32Array::from(vec![Some(2), None, Some(1), Some(2), Some(0)]);
+ // index returns: [[2,3], null, [-1,-2,-1], [], [0,0,0]]
+ let index = UInt32Array::from(vec![Some(3), None, Some(1), Some(2), Some(0)]);
let a = take(&list_array, &index, None).unwrap();
let a: &$list_array_type =
a.as_any().downcast_ref::<$list_array_type>().unwrap();
// construct a value array with expected results:
- // [[2,3], null, [-1,-2,-1], [2,3], [0,0,0]]
+ // [[2,3], null, [-1,-2,-1], [], [0,0,0]]
let expected_data = Int32Array::from(vec![
Some(2),
Some(3),
Some(-1),
Some(-2),
Some(-1),
- Some(2),
- Some(3),
Some(0),
Some(0),
Some(0),
@@ -1564,7 +1568,7 @@ mod tests {
.data()
.clone();
// construct offsets
- let expected_offsets: [$offset_type; 6] = [0, 2, 2, 5, 7, 10];
+ let expected_offsets: [$offset_type; 6] = [0, 2, 2, 5, 5, 8];
let expected_offsets = Buffer::from_slice_ref(&expected_offsets);
// construct list array from the two
let expected_list_data = ArrayData::builder(list_data_type)
@@ -1609,7 +1613,7 @@ mod tests {
let list_data = ArrayData::builder(list_data_type.clone())
.len(4)
.add_buffer(value_offsets)
- .null_bit_buffer(Some(Buffer::from([0b10111101, 0b00000000])))
+ .null_bit_buffer(Some(Buffer::from([0b11111111])))
.add_child_data(value_data)
.build()
.unwrap();
@@ -1682,7 +1686,7 @@ mod tests {
let list_data = ArrayData::builder(list_data_type.clone())
.len(4)
.add_buffer(value_offsets)
- .null_bit_buffer(Some(Buffer::from([0b01111101])))
+ .null_bit_buffer(Some(Buffer::from([0b11111011])))
.add_child_data(value_data)
.build()
.unwrap();
@@ -2057,10 +2061,12 @@ mod tests {
]);
let indices = UInt32Array::from(vec![2, 0]);
- let (indexed, offsets) = take_value_indices_from_list(&list, &indices).unwrap();
+ let (indexed, offsets, null_buf) =
+ take_value_indices_from_list(&list, &indices).unwrap();
assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
assert_eq!(offsets, vec![0, 5, 7]);
+ assert_eq!(null_buf.as_slice(), &[0b11111111]);
}
#[test]
@@ -2072,11 +2078,12 @@ mod tests {
]);
let indices = UInt32Array::from(vec![2, 0]);
- let (indexed, offsets) =
+ let (indexed, offsets, null_buf) =
take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap();
assert_eq!(indexed, Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
assert_eq!(offsets, vec![0, 5, 7]);
+ assert_eq!(null_buf.as_slice(), &[0b11111111]);
}
#[test]