You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/07/25 10:57:30 UTC

[arrow-rs] branch master updated: use exponential search for lexicographical_partition_ranges (#585)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 6c1a86e  use exponential search for lexicographical_partition_ranges (#585)
6c1a86e is described below

commit 6c1a86e26ae45913fdabf3b5852f8bec289dcad9
Author: Jiayu Liu <Ji...@users.noreply.github.com>
AuthorDate: Sun Jul 25 18:57:23 2021 +0800

    use exponential search for lexicographical_partition_ranges (#585)
---
 arrow/src/compute/kernels/partition.rs | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/arrow/src/compute/kernels/partition.rs b/arrow/src/compute/kernels/partition.rs
index ad35e92..8a16942 100644
--- a/arrow/src/compute/kernels/partition.rs
+++ b/arrow/src/compute/kernels/partition.rs
@@ -73,6 +73,30 @@ impl<'a> LexicographicalPartitionIterator<'a> {
     }
 }
 
+/// Exponential search is to remedy for the case when array size and cardinality are both large
+/// see <https://en.wikipedia.org/wiki/Exponential_search>
+#[inline]
+fn exponential_search(
+    indices: &[usize],
+    target: &usize,
+    comparator: &LexicographicalComparator<'_>,
+) -> usize {
+    let mut bound = 1;
+    while bound < indices.len()
+        && comparator.compare(&indices[bound], target) != Ordering::Greater
+    {
+        bound *= 2;
+    }
+    // invariant after while loop:
+    // indices[bound / 2] <= target < indices[min(indices.len(), bound + 1)]
+    // where <= and < are defined by the comparator;
+    // note here we have right = min(indices.len(), bound + 1) because indices[bound] might
+    // actually be considered and must be included.
+    (bound / 2)
+        + indices[(bound / 2)..indices.len().min(bound + 1)]
+            .partition_point(|idx| comparator.compare(idx, target) != Ordering::Greater)
+}
+
 impl<'a> Iterator for LexicographicalPartitionIterator<'a> {
     type Item = Range<usize>;
 
@@ -87,11 +111,11 @@ impl<'a> Iterator for LexicographicalPartitionIterator<'a> {
             // be careful that idx is of type &usize which points to the actual value within value_indices, which itself
             // contains usize (0..row_count), providing access to lexicographical_comparator as pointers into the
             // original columnar data.
-            self.partition_point += self.value_indices[self.partition_point..]
-                .partition_point(|idx| {
-                    self.comparator.compare(idx, &self.partition_point)
-                        != Ordering::Greater
-                });
+            self.partition_point += exponential_search(
+                &self.value_indices[self.partition_point..],
+                &self.partition_point,
+                &self.comparator,
+            );
             let start = self.previous_partition_point;
             let end = self.partition_point;
             self.previous_partition_point = self.partition_point;