You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "tustvold (via GitHub)" <gi...@apache.org> on 2023/07/27 16:23:15 UTC

[GitHub] [arrow-rs] tustvold commented on a diff in pull request #4575: Vectorized lexicographical_partition_ranges (~80% faster)

tustvold commented on code in PR #4575:
URL: https://github.com/apache/arrow-rs/pull/4575#discussion_r1276530177


##########
arrow-ord/src/partition.rs:
##########
@@ -17,146 +17,74 @@
 
 //! Defines partition kernel for `ArrayRef`
 
-use crate::sort::{LexicographicalComparator, SortColumn};
+use crate::comparison::neq_dyn;
+use crate::sort::SortColumn;
+use arrow_array::Array;
+use arrow_buffer::BooleanBuffer;
 use arrow_schema::ArrowError;
-use std::cmp::Ordering;
 use std::ops::Range;
 
 /// Given a list of already sorted columns, find partition ranges that would partition
 /// lexicographically equal values across columns.
 ///
-/// Here LexicographicalComparator is used in conjunction with binary
-/// search so the columns *MUST* be pre-sorted already.
-///
 /// The returned vec would be of size k where k is cardinality of the sorted values; Consecutive
 /// values will be connected: (a, b) and (b, c), where start = 0 and end = n for the first and last
 /// range.
 pub fn lexicographical_partition_ranges(
     columns: &[SortColumn],
 ) -> Result<impl Iterator<Item = Range<usize>> + '_, ArrowError> {
-    LexicographicalPartitionIterator::try_new(columns)
-}
-
-struct LexicographicalPartitionIterator<'a> {
-    comparator: LexicographicalComparator<'a>,
-    num_rows: usize,
-    previous_partition_point: usize,
-    partition_point: usize,
-}
+    if columns.is_empty() {
+        return Err(ArrowError::InvalidArgumentError(
+            "Sort requires at least one column".to_string(),
+        ));
+    }
+    let num_rows = columns[0].values.len();
+    if columns.iter().any(|item| item.values.len() != num_rows) {
+        return Err(ArrowError::ComputeError(
+            "Lexical sort columns have different row counts".to_string(),
+        ));
+    };
 
-impl<'a> LexicographicalPartitionIterator<'a> {
-    fn try_new(
-        columns: &'a [SortColumn],
-    ) -> Result<LexicographicalPartitionIterator, ArrowError> {
-        if columns.is_empty() {
-            return Err(ArrowError::InvalidArgumentError(
-                "Sort requires at least one column".to_string(),
-            ));
-        }
-        let num_rows = columns[0].values.len();
-        if columns.iter().any(|item| item.values.len() != num_rows) {
-            return Err(ArrowError::ComputeError(
-                "Lexical sort columns have different row counts".to_string(),
-            ));
-        };
+    let acc = find_boundaries(&columns[0])?;
+    let acc = columns
+        .iter()
+        .skip(1)
+        .try_fold(acc, |acc, c| find_boundaries(c).map(|b| &acc | &b))?;
 
-        let comparator = LexicographicalComparator::try_new(columns)?;
-        Ok(LexicographicalPartitionIterator {
-            comparator,
-            num_rows,
-            previous_partition_point: 0,
-            partition_point: 0,
-        })
+    let mut out = vec![];
+    let mut current = 0;
+    for idx in acc.set_indices() {
+        let t = current;
+        current = idx + 1;
+        out.push(t..current)
     }
-}
-
-/// Returns the next partition point of the range `start..end` according to the given comparator.
-/// The return value is the index of the first element of the second partition,
-/// and is guaranteed to be between `start..=end` (inclusive).
-///
-/// The values corresponding to those indices are assumed to be partitioned according to the given comparator.
-///
-/// Exponential search is to remedy for the case when array size and cardinality are both large.
-/// In these cases the partition point would be near the beginning of the range and
-/// plain binary search would be doing some unnecessary iterations on each call.
-///
-/// see <https://en.wikipedia.org/wiki/Exponential_search>
-#[inline]
-fn exponential_search_next_partition_point(
-    start: usize,
-    end: usize,
-    comparator: &LexicographicalComparator<'_>,
-) -> usize {
-    let target = start;
-    let mut bound = 1;
-    while bound + start < end
-        && comparator.compare(bound + start, target) != Ordering::Greater
-    {
-        bound *= 2;
+    if current != num_rows {
+        out.push(current..num_rows)
     }
-
-    // invariant after while loop:
-    // (start + bound / 2) <= target < min(end, start + bound + 1)
-    // where <= and < are defined by the comparator;
-    // note here we have right = min(end, start + bound + 1) because (start + bound) might
-    // actually be considered and must be included.
-    partition_point(start + bound / 2, end.min(start + bound + 1), |idx| {
-        comparator.compare(idx, target) != Ordering::Greater
-    })
+    Ok(out.into_iter())

Review Comment:
   In opted to preserve the existing function signature for now, I can definitely see a future incarnation returning the computed bitmask somehow to allow for more optimal processing



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org