You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/07/19 12:05:26 UTC

[arrow] branch master updated: ARROW-17096: [C++][Compute] Fix mode kernel error on boolean array (#13646)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3a5284fc29 ARROW-17096: [C++][Compute] Fix mode kernel error on boolean array (#13646)
3a5284fc29 is described below

commit 3a5284fc29a2b6906ecdb0d89a581051645d7549
Author: Yibo Cai <yi...@arm.com>
AuthorDate: Tue Jul 19 20:05:21 2022 +0800

    ARROW-17096: [C++][Compute] Fix mode kernel error on boolean array (#13646)
    
    Authored-by: Yibo Cai <yi...@arm.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/compute/kernels/aggregate_mode.cc | 25 +++++++++++++++----------
 cpp/src/arrow/compute/kernels/aggregate_test.cc | 19 +++++++++++++++++++
 python/pyarrow/tests/test_compute.py            |  6 ++++++
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc
index c67ca31422..263eced9e3 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc
@@ -26,6 +26,7 @@
 #include "arrow/result.h"
 #include "arrow/stl_allocator.h"
 #include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
 
 namespace arrow {
 namespace compute {
@@ -58,7 +59,8 @@ Result<std::pair<CType*, int64_t*>> PrepareOutput(int64_t n, KernelContext* ctx,
   int64_t* count_buffer = nullptr;
 
   if (n > 0) {
-    ARROW_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx->Allocate(n * sizeof(CType)));
+    const auto mode_buffer_size = bit_util::BytesForBits(n * mode_type->bit_width());
+    ARROW_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx->Allocate(mode_buffer_size));
     ARROW_ASSIGN_OR_RAISE(count_data->buffers[1], ctx->Allocate(n * sizeof(int64_t)));
     mode_buffer = mode_data->template GetMutableValues<CType>(1);
     count_buffer = count_data->template GetMutableValues<int64_t>(1);
@@ -201,18 +203,21 @@ struct CountModer<BooleanType> {
     const int64_t distinct_values = (this->counts[0] != 0) + (this->counts[1] != 0);
     const int64_t n = std::min(options.n, distinct_values);
 
-    bool* mode_buffer;
+    uint8_t* mode_buffer;
     int64_t* count_buffer;
     ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
-                          PrepareOutput<BooleanType>(n, ctx, type, out));
+                          (PrepareOutput<BooleanType, uint8_t>(n, ctx, type, out)));
 
     if (n >= 1) {
-      const bool index = counts[1] > counts[0];
-      mode_buffer[0] = index;
-      count_buffer[0] = counts[index];
+      // at most two bits are useful in mode buffer
+      mode_buffer[0] = 0;
+      const bool first_mode = counts[true] > counts[false];
+      bit_util::SetBitTo(mode_buffer, 0, first_mode);
+      count_buffer[0] = counts[first_mode];
       if (n == 2) {
-        mode_buffer[1] = !index;
-        count_buffer[1] = counts[!index];
+        const bool second_mode = !first_mode;
+        bit_util::SetBitTo(mode_buffer, 1, second_mode);
+        count_buffer[1] = counts[second_mode];
       }
     }
 
@@ -224,7 +229,7 @@ struct CountModer<BooleanType> {
     const ModeOptions& options = ModeState::Get(ctx);
     if ((!options.skip_nulls && values.GetNullCount() > 0) ||
         (values.length - values.null_count < options.min_count)) {
-      return PrepareOutput<BooleanType>(/*n=*/0, ctx, *out->type(), out).status();
+      return PrepareOutput<BooleanType, uint8_t>(0, ctx, *out->type(), out).status();
     }
     UpdateCounts(values);
     return WrapResult(ctx, options, *out->type(), out);
@@ -236,7 +241,7 @@ struct CountModer<BooleanType> {
     ExecResult result;
     if ((!options.skip_nulls && values.null_count() > 0) ||
         (values.length() - values.null_count() < options.min_count)) {
-      RETURN_NOT_OK(PrepareOutput<BooleanType>(/*n=*/0, ctx, *out->type(), &result));
+      RETURN_NOT_OK((PrepareOutput<BooleanType, uint8_t>(0, ctx, *out->type(), &result)));
     } else {
       UpdateCounts(values);
       RETURN_NOT_OK(WrapResult(ctx, options, *out->type(), &result));
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index abd5b5210a..c7909487fb 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -2548,6 +2548,24 @@ void CheckModes(const Datum& array, const ModeOptions options,
   }
 }
 
+template <>
+void CheckModes<bool>(const Datum& array, const ModeOptions options,
+                      const std::vector<bool>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+  ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, options));
+  ValidateOutput(out);
+  const StructArray out_array(out.array());
+  ASSERT_EQ(out_array.length(), expected_modes.size());
+  ASSERT_EQ(out_array.num_fields(), 2);
+
+  const uint8_t* out_modes = out_array.field(0)->data()->GetValues<uint8_t>(1);
+  const int64_t* out_counts = out_array.field(1)->data()->GetValues<int64_t>(1);
+  for (int i = 0; i < out_array.length(); ++i) {
+    ASSERT_TRUE(expected_modes[i] == bit_util::GetBit(out_modes, i));
+    ASSERT_EQ(expected_counts[i], out_counts[i]);
+  }
+}
+
 template <typename T>
 class TestPrimitiveModeKernel : public ::testing::Test {
  public:
@@ -2642,6 +2660,7 @@ TEST_F(TestBooleanModeKernel, Basics) {
   this->AssertModeIs({"[true, null]", "[]", "[null, false]"}, false, 1);
   this->AssertModesEmpty({"[null, null]", "[]", "[null]"});
 
+  this->AssertModesAre("[true, false]", 2, {false, true}, {1, 1});
   this->AssertModesAre("[false, false, true, true, true, false]", 2, {false, true},
                        {3, 3});
   this->AssertModesAre("[true, null, false, false, null, true, null, null, true]", 100,
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 6664f2f824..2bdec412f1 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -370,6 +370,12 @@ def test_mode_array():
     mode = pc.mode(arr, skip_nulls=False, min_count=5)
     assert len(mode) == 0
 
+    arr = pa.array([True, False])
+    mode = pc.mode(arr, n=2)
+    assert len(mode) == 2
+    assert mode[0].as_py() == {"mode": False, "count": 1}
+    assert mode[1].as_py() == {"mode": True, "count": 1}
+
 
 def test_mode_chunked_array():
     # ARROW-9917