You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/06/22 10:51:37 UTC

[arrow-rs] branch cherry_pick_1f1c6372 created (now c1878ce)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a change to branch cherry_pick_1f1c6372
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git.


      at c1878ce  Use partition for bool sort (#448)

This branch includes the following new commits:

     new c1878ce  Use partition for bool sort (#448)

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


[arrow-rs] 01/01: Use partition for bool sort (#448)

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch cherry_pick_1f1c6372
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit c1878ce05ef9f6d56e5ab65b81aa8d946363a5cc
Author: Jiayu Liu <Ji...@users.noreply.github.com>
AuthorDate: Sat Jun 19 19:44:49 2021 +0800

    Use partition for bool sort (#448)
    
    * optimize boolean sort using parition
    
    * add docs
---
 arrow/src/compute/kernels/sort.rs | 60 +++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 22 deletions(-)

diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs
index 9f55826..4d916a6 100644
--- a/arrow/src/compute/kernels/sort.rs
+++ b/arrow/src/compute/kernels/sort.rs
@@ -395,8 +395,13 @@ impl Default for SortOptions {
     }
 }
 
-/// Sort primitive values
-#[allow(clippy::unnecessary_wraps)]
+/// Sort boolean values
+///
+/// when a limit is present, the sort is pair-comparison based as k-select might be more efficient,
+/// when the limit is absent, binary partition is used to speed up (which is linear).
+///
+/// TODO maybe partition_validity call can be eliminated in this case and tri-color sort can be used
+/// instead. https://en.wikipedia.org/wiki/Dutch_national_flag_problem
 fn sort_boolean(
     values: &ArrayRef,
     value_indices: Vec<u32>,
@@ -410,30 +415,41 @@ fn sort_boolean(
         .expect("Unable to downcast to boolean array");
     let descending = options.descending;
 
-    // create tuples that are used for sorting
-    let mut valids = value_indices
-        .into_iter()
-        .map(|index| (index, values.value(index as usize)))
-        .collect::<Vec<(u32, bool)>>();
-
-    let mut nulls = null_indices;
-
-    let valids_len = valids.len();
-    let nulls_len = nulls.len();
+    let valids_len = value_indices.len();
+    let nulls_len = null_indices.len();
 
     let mut len = values.len();
-    if let Some(limit) = limit {
+    let valids = if let Some(limit) = limit {
         len = limit.min(len);
-    }
-    if !descending {
-        sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
-            cmp(a.1, b.1)
-        });
+        // create tuples that are used for sorting
+        let mut valids = value_indices
+            .into_iter()
+            .map(|index| (index, values.value(index as usize)))
+            .collect::<Vec<(u32, bool)>>();
+        if !descending {
+            sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
+                cmp(a.1, b.1)
+            });
+        } else {
+            sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
+                cmp(a.1, b.1).reverse()
+            });
+        }
+        valids
     } else {
-        sort_by(&mut valids, len.saturating_sub(nulls_len), |a, b| {
-            cmp(a.1, b.1).reverse()
-        });
-        // reverse to keep a stable ordering
+        // when limit is not present, we have a better way than sorting: we can just partition
+        // the vec into [false..., true...] or [true..., false...] when descending
+        // TODO when https://github.com/rust-lang/rust/issues/62543 is merged we can use partition_in_place
+        let (mut a, b): (Vec<(u32, bool)>, Vec<(u32, bool)>) = value_indices
+            .into_iter()
+            .map(|index| (index, values.value(index as usize)))
+            .partition(|(_, value)| *value == descending);
+        a.extend(b);
+        a
+    };
+
+    let mut nulls = null_indices;
+    if descending {
         nulls.reverse();
     }