You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/07/16 14:26:12 UTC
[arrow-rs] branch master updated: Support nullable indices in boolean take kernel and some optimizations (#2064)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new a6379fe98 Support nullable indices in boolean take kernel and some optimizations (#2064)
a6379fe98 is described below
commit a6379fe9818de3133a80700b7b8d6d9d59b29e37
Author: Jörn Horstmann <gi...@jhorstmann.net>
AuthorDate: Sat Jul 16 16:26:08 2022 +0200
Support nullable indices in boolean take kernel and some optimizations (#2064)
* Use iterator in boolean take kernel and support nullable indices
* Improve performance of take_boolean kernel by processing validity and value bits separately
* Add test for take_boolean with masked out of bounds indices
* Test and fix for when only the indices contain null values
---
arrow/src/compute/kernels/take.rs | 134 ++++++++++++++++++++++++--------------
1 file changed, 86 insertions(+), 48 deletions(-)
diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs
index fa907656a..4ff7d84d6 100644
--- a/arrow/src/compute/kernels/take.rs
+++ b/arrow/src/compute/kernels/take.rs
@@ -599,66 +599,58 @@ where
Ok(PrimitiveArray::<T>::from(data))
}
-/// `take` implementation for boolean arrays
-fn take_boolean<IndexType>(
- values: &BooleanArray,
+fn take_bits<IndexType>(
+ values: &Buffer,
+ values_offset: usize,
indices: &PrimitiveArray<IndexType>,
-) -> Result<BooleanArray>
+) -> Result<Buffer>
where
IndexType: ArrowNumericType,
IndexType::Native: ToPrimitive,
{
- let data_len = indices.len();
+ let len = indices.len();
+ let values_slice = values.as_slice();
+ let mut output_buffer = MutableBuffer::new_null(len);
+ let output_slice = output_buffer.as_slice_mut();
- let num_byte = bit_util::ceil(data_len, 8);
- let mut val_buf = MutableBuffer::from_len_zeroed(num_byte);
-
- let val_slice = val_buf.as_slice_mut();
-
- let null_count = values.null_count();
-
- let nulls = if null_count == 0 {
- (0..data_len).try_for_each::<_, Result<()>>(|i| {
- let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| {
- ArrowError::ComputeError("Cast to usize failed".to_string())
- })?;
+ indices
+ .iter()
+ .enumerate()
+ .try_for_each::<_, Result<()>>(|(i, index)| {
+ if let Some(index) = index {
+ let index = ToPrimitive::to_usize(&index).ok_or_else(|| {
+ ArrowError::ComputeError("Cast to usize failed".to_string())
+ })?;
- if values.value(index) {
- bit_util::set_bit(val_slice, i);
+ if bit_util::get_bit(values_slice, values_offset + index) {
+ bit_util::set_bit(output_slice, i);
+ }
}
Ok(())
})?;
- indices.data_ref().null_buffer().cloned()
- } else {
- let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, true);
- let null_slice = null_buf.as_slice_mut();
-
- (0..data_len).try_for_each::<_, Result<()>>(|i| {
- let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| {
- ArrowError::ComputeError("Cast to usize failed".to_string())
- })?;
-
- if values.is_null(index) {
- bit_util::unset_bit(null_slice, i);
- } else if values.value(index) {
- bit_util::set_bit(val_slice, i);
- }
-
- Ok(())
- })?;
+ Ok(output_buffer.into())
+}
- match indices.data_ref().null_buffer() {
- Some(buffer) => Some(buffer_bin_and(
- buffer,
- indices.offset(),
- &null_buf.into(),
- 0,
- indices.len(),
- )),
- None => Some(null_buf.into()),
+/// `take` implementation for boolean arrays
+fn take_boolean<IndexType>(
+ values: &BooleanArray,
+ indices: &PrimitiveArray<IndexType>,
+) -> Result<BooleanArray>
+where
+ IndexType: ArrowNumericType,
+ IndexType::Native: ToPrimitive,
+{
+ let val_buf = take_bits(values.values(), values.offset(), indices)?;
+ let null_buf = match values.data().null_buffer() {
+ Some(buf) if values.null_count() > 0 => {
+ Some(take_bits(buf, values.offset(), indices)?)
}
+ _ => indices
+ .data()
+ .null_buffer()
+ .map(|b| b.bit_slice(indices.offset(), indices.len())),
};
let data = unsafe {
@@ -666,9 +658,9 @@ where
DataType::Boolean,
indices.len(),
None,
- nulls,
+ null_buf,
0,
- vec![val_buf.into()],
+ vec![val_buf],
vec![],
)
};
@@ -1467,6 +1459,52 @@ mod tests {
);
}
+ #[test]
+ fn test_take_bool_nullable_index() {
+ // indices where the masked invalid elements would be out of bounds
+ let index_data = ArrayData::try_new(
+ DataType::Int32,
+ 6,
+ Some(Buffer::from_iter(vec![
+ false, true, false, true, false, true,
+ ])),
+ 0,
+ vec![Buffer::from_iter(vec![99, 0, 999, 1, 9999, 2])],
+ vec![],
+ )
+ .unwrap();
+ let index = UInt32Array::from(index_data);
+ test_take_boolean_arrays(
+ vec![Some(true), None, Some(false)],
+ &index,
+ None,
+ vec![None, Some(true), None, None, None, Some(false)],
+ );
+ }
+
+ #[test]
+ fn test_take_bool_nullable_index_nonnull_values() {
+ // indices where the masked invalid elements would be out of bounds
+ let index_data = ArrayData::try_new(
+ DataType::Int32,
+ 6,
+ Some(Buffer::from_iter(vec![
+ false, true, false, true, false, true,
+ ])),
+ 0,
+ vec![Buffer::from_iter(vec![99, 0, 999, 1, 9999, 2])],
+ vec![],
+ )
+ .unwrap();
+ let index = UInt32Array::from(index_data);
+ test_take_boolean_arrays(
+ vec![Some(true), Some(true), Some(false)],
+ &index,
+ None,
+ vec![None, Some(true), None, Some(true), None, Some(false)],
+ );
+ }
+
#[test]
fn test_take_bool_with_offset() {
let index =