You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/02/01 14:50:53 UTC

[GitHub] [arrow-rs] tustvold commented on a change in pull request #1228: Faster bitmask iteration

tustvold commented on a change in pull request #1228:
URL: https://github.com/apache/arrow-rs/pull/1228#discussion_r796673841



##########
File path: arrow/src/compute/kernels/filter.rs
##########
@@ -17,184 +17,117 @@
 
 //! Defines miscellaneous array kernels.
 
+use crate::array::*;
 use crate::buffer::buffer_bin_and;
 use crate::datatypes::DataType;
 use crate::error::Result;
 use crate::record_batch::RecordBatch;
-use crate::{array::*, util::bit_chunk_iterator::BitChunkIterator};
-use std::iter::Enumerate;
+use crate::util::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator};
 
 /// Function that can filter arbitrary arrays
 pub type Filter<'a> = Box<dyn Fn(&ArrayData) -> ArrayData + 'a>;
 
-/// Internal state of [SlicesIterator]
-#[derive(Debug, PartialEq)]
-enum State {
-    // it is iterating over bits of a mask (`u64`, steps of size of 1 slot)
-    Bits(u64),
-    // it is iterating over chunks (steps of size of 64 slots)
-    Chunks,
-    // it is iterating over the remaining bits (steps of size of 1 slot)
-    Remainder,
-    // nothing more to iterate.
-    Finish,
-}
-
 /// An iterator of `(usize, usize)` each representing an interval `[start,end[` whose
 /// slots of a [BooleanArray] are true. Each interval corresponds to a contiguous region of memory to be
 /// "taken" from an array to be filtered.
 #[derive(Debug)]
 pub struct SlicesIterator<'a> {
-    iter: Enumerate<BitChunkIterator<'a>>,
-    state: State,
-    filter: &'a BooleanArray,
-    remainder_mask: u64,
-    remainder_len: usize,
-    chunk_len: usize,
+    iter: UnalignedBitChunkIterator<'a>,
     len: usize,
-    start: usize,
-    on_region: bool,
-    current_chunk: usize,
-    current_bit: usize,
+    offset: usize,
+    current_chunk: u64,
 }
 
 impl<'a> SlicesIterator<'a> {
     pub fn new(filter: &'a BooleanArray) -> Self {
         let values = &filter.data_ref().buffers()[0];
-        let chunks = values.bit_chunks(filter.offset(), filter.len());
+        let len = filter.len();
+        let chunk = UnalignedBitChunk::new(values.as_slice(), filter.offset(), len);
+        let mut iter = chunk.iter();
+
+        let offset = chunk.lead_padding();
+        let current_chunk = iter.next().unwrap_or(0);
 
         Self {
-            iter: chunks.iter().enumerate(),
-            state: State::Chunks,
-            filter,
-            remainder_len: chunks.remainder_len(),
-            chunk_len: chunks.chunk_len(),
-            remainder_mask: chunks.remainder_bits(),
-            len: 0,
-            start: 0,
-            on_region: false,
-            current_chunk: 0,
-            current_bit: 0,
+            iter,
+            len,
+            offset,
+            current_chunk,
         }
     }
 
-    /// Counts the number of set bits in the filter array.
-    fn filter_count(&self) -> usize {
-        let values = self.filter.values();
-        values.count_set_bits_offset(self.filter.offset(), self.filter.len())
-    }
-
-    #[inline]
-    fn current_start(&self) -> usize {
-        self.current_chunk * 64 + self.current_bit
-    }
-
-    #[inline]
-    fn iterate_bits(&mut self, mask: u64, max: usize) -> Option<(usize, usize)> {
-        while self.current_bit < max {
-            if (mask & (1 << self.current_bit)) != 0 {
-                if !self.on_region {
-                    self.start = self.current_start();
-                    self.on_region = true;
-                }
-                self.len += 1;
-            } else if self.on_region {
-                let result = (self.start, self.start + self.len);
-                self.len = 0;
-                self.on_region = false;
-                self.current_bit += 1;
-                return Some(result);
+    fn advance_to_set_bit(&mut self) -> Option<(usize, u32)> {

Review comment:
       Done - was not an intentional omission




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org