You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/02/06 15:02:20 UTC

[GitHub] [arrow-rs] jorgecarleitao commented on a change in pull request #1248: Specialized filter kernels

jorgecarleitao commented on a change in pull request #1248:
URL: https://github.com/apache/arrow-rs/pull/1248#discussion_r800191885



##########
File path: arrow/src/compute/kernels/filter.rs
##########
@@ -119,17 +155,83 @@ impl<'a> Iterator for SlicesIterator<'a> {
     }
 }
 
+/// An iterator of `usize` whose index in [`BooleanArray`] is true
+///
+/// This provides the best performance on all but the least selective predicates (which keep most
+/// / all rows), where the benefits of copying large runs instead favours [`SlicesIterator`]
+struct IndexIterator<'a> {
+    current_chunk: u64,
+    chunk_offset: i64,
+    remaining: usize,
+    iter: UnalignedBitChunkIterator<'a>,
+}
+
+impl<'a> IndexIterator<'a> {
+    fn new(filter: &'a BooleanArray, len: usize) -> Self {
+        assert_eq!(filter.null_count(), 0);
+        let data = filter.data();
+        let chunks =
+            UnalignedBitChunk::new(&data.buffers()[0], data.offset(), data.len());
+        let mut iter = chunks.iter();
+
+        let current_chunk = iter.next().unwrap_or(0);
+        let chunk_offset = -(chunks.lead_padding() as i64);
+
+        Self {
+            current_chunk,
+            chunk_offset,
+            remaining: len,
+            iter,
+        }
+    }
+}
+
+impl<'a> Iterator for IndexIterator<'a> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        while self.remaining != 0 {
+            if self.current_chunk != 0 {
+                let bit_pos = self.current_chunk.trailing_zeros();
+                self.current_chunk ^= 1 << bit_pos;
+                self.remaining -= 1;
+                return Some((self.chunk_offset + bit_pos as i64) as usize);
+            }
+
+            // Must panic if exhausted early as trusted length iterator
+            self.current_chunk = self.iter.next().expect("IndexIterator exhausted early");
+            self.chunk_offset += 64;
+        }
+        None
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (self.remaining, Some(self.remaining))
+    }
+}
+
+/// Counts the number of set bits in `filter`
 fn filter_count(filter: &BooleanArray) -> usize {

Review comment:
       not sure why this was here, but isn't this equal to `filter.len() - filter.null_count()`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org