You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/07/29 07:31:39 UTC

[GitHub] [arrow] andygrove commented on a change in pull request #7798: ARROW-9523 [Rust] Improve filter kernel performance

andygrove commented on a change in pull request #7798:
URL: https://github.com/apache/arrow/pull/7798#discussion_r461228050



##########
File path: rust/arrow/src/compute/kernels/filter.rs
##########
@@ -17,139 +17,466 @@
 
 //! Defines miscellaneous array kernels.
 
-use std::sync::Arc;
-
 use crate::array::*;
 use crate::datatypes::{ArrowNumericType, DataType, TimeUnit};
 use crate::error::{ArrowError, Result};
+use crate::record_batch::RecordBatch;
+use crate::{
+    bitmap::Bitmap,
+    buffer::{Buffer, MutableBuffer},
+    memory,
+    util::bit_util,
+};
+use std::{mem, sync::Arc};
 
-/// Helper function to perform boolean lambda function on values from two arrays.
-fn bool_op<T, F>(
-    left: &PrimitiveArray<T>,
-    right: &PrimitiveArray<T>,
-    op: F,
-) -> Result<BooleanArray>
-where
-    T: ArrowNumericType,
-    F: Fn(Option<T::Native>, Option<T::Native>) -> bool,
-{
-    if left.len() != right.len() {
-        return Err(ArrowError::ComputeError(
-            "Cannot perform math operation on arrays of different length".to_string(),
-        ));
+/// trait for copying filtered null bitmap bits
+trait CopyNullBit {
+    fn copy_null_bit(&mut self, source_index: usize);
+    fn copy_null_bits(&mut self, source_index: usize, count: usize);
+    fn null_count(&self) -> usize;
+    fn null_buffer(&mut self) -> Buffer;
+}
+
+/// no-op null bitmap copy implementation,
+/// used when the filtered data array doesn't have a null bitmap
+struct NullBitNoop {}
+
+impl NullBitNoop {
+    fn new() -> Self {
+        NullBitNoop {}
     }
-    let mut b = BooleanArray::builder(left.len());
-    for i in 0..left.len() {
-        let index = i;
-        let l = if left.is_null(i) {
-            None
-        } else {
-            Some(left.value(index))
-        };
-        let r = if right.is_null(i) {
-            None
-        } else {
-            Some(right.value(index))
-        };
-        b.append_value(op(l, r))?;
+}
+
+impl CopyNullBit for NullBitNoop {
+    #[inline]
+    fn copy_null_bit(&mut self, _source_index: usize) {
+        // do nothing
+    }
+
+    #[inline]
+    fn copy_null_bits(&mut self, _source_index: usize, _count: usize) {
+        // do nothing
+    }
+
+    fn null_count(&self) -> usize {
+        0
+    }
+
+    fn null_buffer(&mut self) -> Buffer {
+        Buffer::from([0u8; 0])
     }
-    Ok(b.finish())
 }
 
-macro_rules! filter_array {
-    ($array:expr, $filter:expr, $array_type:ident) => {{
-        let b = $array.as_any().downcast_ref::<$array_type>().unwrap();
-        let mut builder = $array_type::builder(b.len());
-        for i in 0..b.len() {
-            if $filter.value(i) {
-                if b.is_null(i) {
-                    builder.append_null()?;
-                } else {
-                    builder.append_value(b.value(i))?;
-                }
-            }
-        }
-        Ok(Arc::new(builder.finish()))
-    }};
+/// null bitmap copy implementation,
+/// used when the filtered data array has a null bitmap
+struct NullBitSetter<'a> {
+    target_buffer: MutableBuffer,
+    source_bytes: &'a [u8],
+    target_index: usize,
+    null_count: usize,
 }
 
-/// Returns the array, taking only the elements matching the filter
-pub fn filter(array: &Array, filter: &BooleanArray) -> Result<ArrayRef> {
-    match array.data_type() {
-        DataType::UInt8 => filter_array!(array, filter, UInt8Array),
-        DataType::UInt16 => filter_array!(array, filter, UInt16Array),
-        DataType::UInt32 => filter_array!(array, filter, UInt32Array),
-        DataType::UInt64 => filter_array!(array, filter, UInt64Array),
-        DataType::Int8 => filter_array!(array, filter, Int8Array),
-        DataType::Int16 => filter_array!(array, filter, Int16Array),
-        DataType::Int32 => filter_array!(array, filter, Int32Array),
-        DataType::Int64 => filter_array!(array, filter, Int64Array),
-        DataType::Float32 => filter_array!(array, filter, Float32Array),
-        DataType::Float64 => filter_array!(array, filter, Float64Array),
-        DataType::Boolean => filter_array!(array, filter, BooleanArray),
-        DataType::Date32(_) => filter_array!(array, filter, Date32Array),
-        DataType::Date64(_) => filter_array!(array, filter, Date64Array),
-        DataType::Time32(TimeUnit::Second) => {
-            filter_array!(array, filter, Time32SecondArray)
-        }
-        DataType::Time32(TimeUnit::Millisecond) => {
-            filter_array!(array, filter, Time32MillisecondArray)
-        }
-        DataType::Time64(TimeUnit::Microsecond) => {
-            filter_array!(array, filter, Time64MicrosecondArray)
-        }
-        DataType::Time64(TimeUnit::Nanosecond) => {
-            filter_array!(array, filter, Time64NanosecondArray)
-        }
-        DataType::Duration(TimeUnit::Second) => {
-            filter_array!(array, filter, DurationSecondArray)
-        }
-        DataType::Duration(TimeUnit::Millisecond) => {
-            filter_array!(array, filter, DurationMillisecondArray)
-        }
-        DataType::Duration(TimeUnit::Microsecond) => {
-            filter_array!(array, filter, DurationMicrosecondArray)
+impl<'a> NullBitSetter<'a> {
+    fn new(null_bitmap: &'a Bitmap) -> Self {
+        let null_bytes = null_bitmap.buffer_ref().data();
+        // create null bitmap buffer with same length and initialize null bitmap buffer to 1s
+        let null_buffer =
+            MutableBuffer::new(null_bytes.len()).with_bitset(null_bytes.len(), true);
+        NullBitSetter {
+            source_bytes: null_bytes,
+            target_buffer: null_buffer,
+            target_index: 0,
+            null_count: 0,
         }
-        DataType::Duration(TimeUnit::Nanosecond) => {
-            filter_array!(array, filter, DurationNanosecondArray)
+    }
+}
+
+impl<'a> CopyNullBit for NullBitSetter<'a> {
+    #[inline]
+    fn copy_null_bit(&mut self, source_index: usize) {
+        if !bit_util::get_bit(self.source_bytes, source_index) {
+            bit_util::unset_bit(self.target_buffer.data_mut(), self.target_index);
+            self.null_count += 1;
         }
-        DataType::Timestamp(TimeUnit::Second, _) => {
-            filter_array!(array, filter, TimestampSecondArray)
+        self.target_index += 1;
+    }
+
+    #[inline]
+    fn copy_null_bits(&mut self, source_index: usize, count: usize) {
+        for i in 0..count {
+            self.copy_null_bit(source_index + i);
         }
-        DataType::Timestamp(TimeUnit::Millisecond, _) => {
-            filter_array!(array, filter, TimestampMillisecondArray)
+    }
+
+    fn null_count(&self) -> usize {
+        self.null_count
+    }
+
+    fn null_buffer(&mut self) -> Buffer {
+        self.target_buffer.resize(self.target_index).unwrap();
+        // use mem::replace to detach self.target_buffer from self so that it can be returned
+        let target_buffer = mem::replace(&mut self.target_buffer, MutableBuffer::new(0));
+        target_buffer.freeze()
+    }
+}
+
+fn get_null_bit_setter<'a>(data_array: &'a impl Array) -> Box<CopyNullBit + 'a> {
+    if let Some(null_bitmap) = data_array.data_ref().null_bitmap() {
+        // only return an actual null bit copy implementation if null_bitmap is set
+        Box::new(NullBitSetter::new(null_bitmap))
+    } else {
+        // otherwise return a no-op copy null bit implementation
+        // for improved performance when the filtered array doesn't contain NULLs
+        Box::new(NullBitNoop::new())
+    }
+}
+
+// transmute filter array to u64
+// - optimize filtering with highly selective filters by skipping entire batches of 64 filter bits
+// - if the data array being filtered doesn't have a null bitmap, no time is wasted to copy a null bitmap
+fn filter_array_impl(
+    filter_context: &FilterContext,
+    data_array: &impl Array,
+    array_type: DataType,
+    value_size: usize,
+) -> Result<ArrayDataBuilder> {
+    if filter_context.filter_len > data_array.len() {
+        return Err(ArrowError::ComputeError(
+            "Filter array cannot be larger than data array".to_string(),
+        ));
+    }
+    let filtered_count = filter_context.filtered_count;
+    let filter_mask = &filter_context.filter_mask;
+    let filter_u64 = &filter_context.filter_u64;
+    let data_start = data_array.data_ref().buffers()[0].raw_data();
+    let mut value_buffer = MutableBuffer::new(filtered_count * value_size);
+    value_buffer.resize(filtered_count * value_size)?;
+    let mut value_position = value_buffer.raw_data_mut();
+    let mut null_bit_setter = get_null_bit_setter(data_array);
+    let null_bit_setter = null_bit_setter.as_mut();
+    let all_ones_batch = !0u64;
+
+    for (i, filter_batch) in filter_u64.iter().enumerate() {
+        // foreach u64 batch
+        let filter_batch = *filter_batch;
+        if filter_batch == 0 {
+            // if batch == 0: skip
+            continue;
+        } else if filter_batch == all_ones_batch {
+            // if batch == all 1s: copy all 64 values in one go
+            let data_index = i * 64;
+            let data_len = value_size * 64;
+            null_bit_setter.copy_null_bits(data_index, 64);
+            unsafe {

Review comment:
       Thanks @yordan-pavlov We can always consider the unsafe changes in a future PR, but removing them for now makes it much easier to get these changes merged.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org