You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/12/15 19:56:52 UTC
[arrow-rs] branch master updated: Avoid allocating vector of indices in lexicographical_partition_ranges (#998)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new fc343e7  Avoid allocating vector of indices in lexicographical_partition_ranges (#998)
fc343e7 is described below

commit fc343e7c02a05cd7d2a1d401f4e5feeb8d6de179
Author: Jörn Horstmann <gi...@jhorstmann.net>
AuthorDate: Wed Dec 15 20:56:45 2021 +0100

    Avoid allocating vector of indices in lexicographical_partition_ranges (#998)
    
    * Avoid allocating vector of indices in lexicographical_partition_ranges
    
    * Adjust comments
    
    * Improve comments and remove one unneeded parameter
---
 arrow/src/compute/kernels/partition.rs | 133 ++++++++++++++++++++++++++-------
 1 file changed, 108 insertions(+), 25 deletions(-)

diff --git a/arrow/src/compute/kernels/partition.rs b/arrow/src/compute/kernels/partition.rs
index 8a16942..991aa7b 100644
--- a/arrow/src/compute/kernels/partition.rs
+++ b/arrow/src/compute/kernels/partition.rs
@@ -44,7 +44,6 @@ struct LexicographicalPartitionIterator<'a> {
     num_rows: usize,
     previous_partition_point: usize,
     partition_point: usize,
-    value_indices: Vec<usize>,
 }
 
 impl<'a> LexicographicalPartitionIterator<'a> {
@@ -62,39 +61,78 @@ impl<'a> LexicographicalPartitionIterator<'a> {
         };
 
         let comparator = LexicographicalComparator::try_new(columns)?;
-        let value_indices = (0..num_rows).collect::<Vec<usize>>();
         Ok(LexicographicalPartitionIterator {
             comparator,
             num_rows,
             previous_partition_point: 0,
             partition_point: 0,
-            value_indices,
         })
     }
 }
 
-/// Exponential search is to remedy for the case when array size and cardinality are both large
+/// Returns the next partition point of the range `start..end` according to the given comparator.
+/// The return value is the index of the first element of the second partition,
+/// and is guaranteed to be between `start..=end` (inclusive).
+///
+/// The values corresponding to those indices are assumed to be partitioned according to the given comparator.
+///
+/// Exponential search is to remedy for the case when array size and cardinality are both large.
+/// In these cases the partition point would be near the beginning of the range and
+/// plain binary search would be doing some unnecessary iterations on each call.
+///
 /// see <https://en.wikipedia.org/wiki/Exponential_search>
 #[inline]
-fn exponential_search(
-    indices: &[usize],
-    target: &usize,
+fn exponential_search_next_partition_point(
+    start: usize,
+    end: usize,
     comparator: &LexicographicalComparator<'_>,
 ) -> usize {
+    let target = start;
     let mut bound = 1;
-    while bound < indices.len()
-        && comparator.compare(&indices[bound], target) != Ordering::Greater
+    while bound + start < end
+        && comparator.compare(&(bound + start), &target) != Ordering::Greater
     {
         bound *= 2;
     }
+
     // invariant after while loop:
-    // indices[bound / 2] <= target < indices[min(indices.len(), bound + 1)]
+    // (start + bound / 2) <= target < min(end, start + bound + 1)
     // where <= and < are defined by the comparator;
-    // note here we have right = min(indices.len(), bound + 1) because indices[bound] might
+    // note here we have right = min(end, start + bound + 1) because (start + bound) might
     // actually be considered and must be included.
-    (bound / 2)
-        + indices[(bound / 2)..indices.len().min(bound + 1)]
-            .partition_point(|idx| comparator.compare(idx, target) != Ordering::Greater)
+    partition_point(start + bound / 2, end.min(start + bound + 1), |idx| {
+        comparator.compare(&idx, &target) != Ordering::Greater
+    })
+}
+
+/// Returns the partition point of the range `start..end` according to the given predicate.
+/// The return value is the index of the first element of the second partition,
+/// and is guaranteed to be between `start..=end` (inclusive).
+///
+/// The algorithm is similar to a binary search.
+///
+/// The values corresponding to those indices are assumed to be partitioned according to the given predicate.
+///
+/// See [`std::slice::partition_point`]
+#[inline]
+fn partition_point<P: Fn(usize) -> bool>(start: usize, end: usize, pred: P) -> usize {
+    let mut left = start;
+    let mut right = end;
+    let mut size = right - left;
+    while left < right {
+        let mid = left + size / 2;
+
+        let less = pred(mid);
+
+        if less {
+            left = mid + 1;
+        } else {
+            right = mid;
+        }
+
+        size = right - left;
+    }
+    left
 }
 
 impl<'a> Iterator for LexicographicalPartitionIterator<'a> {
@@ -103,17 +141,12 @@ impl<'a> Iterator for LexicographicalPartitionIterator<'a> {
     fn next(&mut self) -> Option<Self::Item> {
         if self.partition_point < self.num_rows {
             // invariant:
-            // value_indices[0..previous_partition_point] all are values <= value_indices[previous_partition_point]
-            // so in order to save time we can do binary search on the value_indices[previous_partition_point..]
-            // and find when any value is greater than value_indices[previous_partition_point]; because we are using
-            // new indices, the new offset is _added_ to the previous_partition_point.
-            //
-            // be careful that idx is of type &usize which points to the actual value within value_indices, which itself
-            // contains usize (0..row_count), providing access to lexicographical_comparator as pointers into the
-            // original columnar data.
-            self.partition_point += exponential_search(
-                &self.value_indices[self.partition_point..],
-                &self.partition_point,
+            // in the range [0..previous_partition_point] all values are <= the value at [previous_partition_point]
+            // so in order to save time we can do binary search on the range [previous_partition_point..num_rows]
+            // and find the index where any value is greater than the value at [previous_partition_point]
+            self.partition_point = exponential_search_next_partition_point(
+                self.partition_point,
+                self.num_rows,
                 &self.comparator,
             );
             let start = self.previous_partition_point;
@@ -135,6 +168,56 @@ mod tests {
     use std::sync::Arc;
 
     #[test]
+    fn test_partition_point() {
+        let input = &[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4];
+        {
+            let median = input[input.len() / 2];
+            assert_eq!(
+                9,
+                partition_point(
+                    0,
+                    input.len(),
+                    &(|i: usize| input[i].cmp(&median) != Ordering::Greater)
+                )
+            );
+        }
+        {
+            let search = input[9];
+            assert_eq!(
+                12,
+                partition_point(
+                    9,
+                    input.len(),
+                    &(|i: usize| input[i].cmp(&search) != Ordering::Greater)
+                )
+            );
+        }
+        {
+            let search = input[0];
+            assert_eq!(
+                3,
+                partition_point(
+                    0,
+                    9,
+                    &(|i: usize| input[i].cmp(&search) != Ordering::Greater)
+                )
+            );
+        }
+        let input = &[1, 2, 2, 2, 2, 2, 2, 2, 9];
+        {
+            let search = input[5];
+            assert_eq!(
+                8,
+                partition_point(
+                    5,
+                    9,
+                    &(|i: usize| input[i].cmp(&search) != Ordering::Greater)
+                )
+            );
+        }
+    }
+
+    #[test]
     fn test_lexicographical_partition_ranges_empty() {
         let input = vec![];
         assert!(