You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/25 14:57:17 UTC

[arrow] branch master updated: ARROW-9225: [C++][Compute] Speed up counting sort

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 408882b  ARROW-9225: [C++][Compute] Speed up counting sort
408882b is described below

commit 408882b88751c3d49f91062f6abc98af17b21586
Author: Yibo Cai <yi...@arm.com>
AuthorDate: Thu Jun 25 09:56:52 2020 -0500

    ARROW-9225: [C++][Compute] Speed up counting sort
    
    For counting sort, using BitmapReader for null checking performs much
    better than BitBlockCount.
    This patch also drops counting nulls as it's not necessary.
    
    Closes #7542 from cyb70289/sort
    
    Authored-by: Yibo Cai <yi...@arm.com>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/compute/kernels/vector_sort.cc | 42 +++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc
index 8e36737..220b3ae 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -88,6 +88,28 @@ struct PartitionIndices {
   }
 };
 
+template <typename ArrayType, typename VisitorNotNull, typename VisitorNull>
+inline void VisitRawValuesInline(const ArrayType& values,
+                                 VisitorNotNull&& visitor_not_null,
+                                 VisitorNull&& visitor_null) {
+  const auto data = values.raw_values();
+  if (values.null_count() > 0) {
+    BitmapReader reader(values.null_bitmap_data(), values.offset(), values.length());
+    for (int64_t i = 0; i < values.length(); ++i) {
+      if (reader.IsSet()) {
+        visitor_not_null(data[i]);
+      } else {
+        visitor_null();
+      }
+      reader.Next();
+    }
+  } else {
+    for (int64_t i = 0; i < values.length(); ++i) {
+      visitor_not_null(data[i]);
+    }
+  }
+}
+
 }  // namespace
 
 template <typename ArrowType>
@@ -145,20 +167,19 @@ class CountSorter {
                     const ArrayType& values) {
     const uint32_t value_range = value_range_;
 
-    // first slot reserved for prefix sum, last slot for null value
-    std::vector<CounterType> counts(1 + value_range + 1);
+    // first slot reserved for prefix sum
+    std::vector<CounterType> counts(1 + value_range);
 
-    VisitArrayDataInline<ArrowType>(
-        *values.data(), [&](c_type v) { ++counts[v - min_ + 1]; },
-        [&]() { ++counts[value_range + 1]; });
+    VisitRawValuesInline(
+        values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {});
 
     for (uint32_t i = 1; i <= value_range; ++i) {
       counts[i] += counts[i - 1];
     }
 
     int64_t index = 0;
-    VisitArrayDataInline<ArrowType>(
-        *values.data(), [&](c_type v) { indices_begin[counts[v - min_]++] = index++; },
+    VisitRawValuesInline(
+        values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; },
         [&]() { indices_begin[counts[value_range]++] = index++; });
   }
 };
@@ -177,13 +198,14 @@ class CountOrCompareSorter {
       c_type min{std::numeric_limits<c_type>::max()};
       c_type max{std::numeric_limits<c_type>::min()};
 
-      VisitArrayDataInline<ArrowType>(
-          *values.data(),
+      VisitRawValuesInline(
+          values,
           [&](c_type v) {
             min = std::min(min, v);
             max = std::max(max, v);
           },
-          [&]() {});
+          []() {});
+
       // For signed int32/64, (max - min) may overflow and trigger UBSAN.
       // Cast to largest unsigned type(uint64_t) before subtraction.
       if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <=