You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/11 09:39:07 UTC

[GitHub] [arrow] cyb70289 opened a new pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

cyb70289 opened a new pull request #8637:
URL: https://github.com/apache/arrow/pull/8637


   This patch generalize mode kernel to return top-n modes.
   No performance difference for normal benchmarks.
   About 20% performance drop for 100% null benchmarks.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521967685



##########
File path: docs/source/cpp/compute.rst
##########
@@ -140,7 +140,7 @@ Aggregations
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
 | min_max                  | Unary      | Numeric            | Scalar Struct  (1)    | :struct:`MinMaxOptions`                    |
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
-| mode                     | Unary      | Numeric            | Scalar Struct  (2)    |                                            |
+| mode                     | Unary      | Numeric            | Scalar Struct  (2)    | :struct:`ModeOptions`                      |

Review comment:
       `min_max` returns a scalar, AFAIK.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521865178



##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -28,6 +29,9 @@ namespace internal {
 
 namespace {
 
+constexpr char kModeFieldName[] = "modes";
+constexpr char kCountFieldName[] = "counts";

Review comment:
       Done




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521865256



##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -205,35 +243,59 @@ struct ModeImpl : public ScalarAggregator {
     this->state.MergeFrom(std::move(other.state));
   }
 
-  void Finalize(KernelContext*, Datum* out) override {
-    using ModeType = typename TypeTraits<ArrowType>::ScalarType;
-    using CountType = typename TypeTraits<Int64Type>::ScalarType;
+  static std::shared_ptr<ArrayData> MakeArrayData(
+      const std::shared_ptr<DataType>& data_type, int64_t n) {
+    auto data = ArrayData::Make(data_type, n, 0);
+    data->buffers.resize(2);
+    data->buffers[0] = nullptr;
+    return data;
+  }
 
-    std::vector<std::shared_ptr<Scalar>> values;
-    auto mode_count = this->state.Finalize();
-    auto mode = mode_count.first;
-    auto count = mode_count.second;
-    if (count == 0) {
-      values = {std::make_shared<ModeType>(), std::make_shared<CountType>()};
-    } else {
-      values = {std::make_shared<ModeType>(mode), std::make_shared<CountType>(count)};
+  void Finalize(KernelContext* ctx, Datum* out) override {
+    const auto& mode_type = TypeTraits<ArrowType>::type_singleton();
+    const auto& count_type = int64();
+    const auto& out_type =
+        struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+
+    int n = this->options.n;
+    if (n > state.DistinctValues()) {
+      n = state.DistinctValues();
+    }
+    if (n <= 0) {
+      *out = Datum(StructArray(out_type, 0, ArrayVector{}).data());

Review comment:
       Done

##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -205,35 +243,59 @@ struct ModeImpl : public ScalarAggregator {
     this->state.MergeFrom(std::move(other.state));
   }
 
-  void Finalize(KernelContext*, Datum* out) override {
-    using ModeType = typename TypeTraits<ArrowType>::ScalarType;
-    using CountType = typename TypeTraits<Int64Type>::ScalarType;
+  static std::shared_ptr<ArrayData> MakeArrayData(
+      const std::shared_ptr<DataType>& data_type, int64_t n) {
+    auto data = ArrayData::Make(data_type, n, 0);
+    data->buffers.resize(2);
+    data->buffers[0] = nullptr;
+    return data;
+  }
 
-    std::vector<std::shared_ptr<Scalar>> values;
-    auto mode_count = this->state.Finalize();
-    auto mode = mode_count.first;
-    auto count = mode_count.second;
-    if (count == 0) {
-      values = {std::make_shared<ModeType>(), std::make_shared<CountType>()};
-    } else {
-      values = {std::make_shared<ModeType>(mode), std::make_shared<CountType>(count)};
+  void Finalize(KernelContext* ctx, Datum* out) override {
+    const auto& mode_type = TypeTraits<ArrowType>::type_singleton();
+    const auto& count_type = int64();
+    const auto& out_type =
+        struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+
+    int n = this->options.n;
+    if (n > state.DistinctValues()) {
+      n = state.DistinctValues();
+    }
+    if (n <= 0) {
+      *out = Datum(StructArray(out_type, 0, ArrayVector{}).data());
+      return;
     }
-    out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
+
+    auto mode_data = this->MakeArrayData(mode_type, n);
+    auto count_data = this->MakeArrayData(count_type, n);
+    KERNEL_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx, ctx->Allocate(n * sizeof(CType)));
+    KERNEL_ASSIGN_OR_RAISE(count_data->buffers[1], ctx,
+                           ctx->Allocate(n * sizeof(int64_t)));
+
+    CType* mode_buffer = mode_data->template GetMutableValues<CType>(1);
+    int64_t* count_buffer = count_data->template GetMutableValues<int64_t>(1);
+    this->state.Finalize(mode_buffer, count_buffer, n);
+
+    *out = Datum(
+        StructArray(out_type, n, ArrayVector{MakeArray(mode_data), MakeArray(count_data)})

Review comment:
       Done

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -735,71 +735,63 @@ class TestPrimitiveModeKernel : public ::testing::Test {
  public:
   using ArrowType = T;
   using Traits = TypeTraits<ArrowType>;
-  using c_type = typename ArrowType::c_type;
-  using ModeType = typename Traits::ScalarType;
-  using CountType = typename TypeTraits<Int64Type>::ScalarType;
-
-  void AssertModeIs(const Datum& array, c_type expected_mode, int64_t expected_count) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
+  using CType = typename ArrowType::c_type;
+
+  void AssertModesAre(const Datum& array, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n}));

Review comment:
       Done

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -735,71 +735,63 @@ class TestPrimitiveModeKernel : public ::testing::Test {
  public:
   using ArrowType = T;
   using Traits = TypeTraits<ArrowType>;
-  using c_type = typename ArrowType::c_type;
-  using ModeType = typename Traits::ScalarType;
-  using CountType = typename TypeTraits<Int64Type>::ScalarType;
-
-  void AssertModeIs(const Datum& array, c_type expected_mode, int64_t expected_count) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
+  using CType = typename ArrowType::c_type;
+
+  void AssertModesAre(const Datum& array, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n}));
+    const StructArray out_array(out.array());
+    ASSERT_EQ(out_array.length(), expected_modes.size());
+    ASSERT_EQ(out_array.num_fields(), 2);
+
+    const CType* out_modes = out_array.field(0)->data()->GetValues<CType>(1);
+    const int64_t* out_counts = out_array.field(1)->data()->GetValues<int64_t>(1);
+    for (int i = 0; i < out_array.length(); ++i) {
+      // equal or nan equal
+      ASSERT_TRUE(
+          (expected_modes[i] == out_modes[i]) ||
+          (expected_modes[i] != expected_modes[i] && out_modes[i] != out_modes[i]));
+      ASSERT_EQ(expected_counts[i], out_counts[i]);
+    }
+  }
 
-    const auto& out_mode = checked_cast<const ModeType&>(*value.value[0]);
-    ASSERT_EQ(expected_mode, out_mode.value);
+  void AssertModesAre(const std::string& json, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertModesAre(array, n, expected_modes, expected_counts);
+  }
 
-    const auto& out_count = checked_cast<const CountType&>(*value.value[1]);
-    ASSERT_EQ(expected_count, out_count.value);
+  void AssertModeIs(const Datum& array, CType expected_mode, int64_t expected_count) {
+    AssertModesAre(array, 1, {expected_mode}, {expected_count});
   }
 
-  void AssertModeIs(const std::string& json, c_type expected_mode,
+  void AssertModeIs(const std::string& json, CType expected_mode,
                     int64_t expected_count) {
     auto array = ArrayFromJSON(type_singleton(), json);
     AssertModeIs(array, expected_mode, expected_count);
   }
 
-  void AssertModeIs(const std::vector<std::string>& json, c_type expected_mode,
+  void AssertModeIs(const std::vector<std::string>& json, CType expected_mode,
                     int64_t expected_count) {
     auto chunked = ChunkedArrayFromJSON(type_singleton(), json);
     AssertModeIs(chunked, expected_mode, expected_count);
   }
 
-  void AssertModeIsNull(const Datum& array) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
-
-    for (const auto& val : value.value) {
-      ASSERT_FALSE(val->is_valid);
-    }
+  void AssertModeIsNull(const Datum& array, int n) {

Review comment:
       Done

##########
File path: docs/source/cpp/compute.rst
##########
@@ -153,7 +153,7 @@ Notes:
 
 * \(1) Output is a ``{"min": input type, "max": input type}`` Struct
 
-* \(2) Output is a ``{"mode": input type, "count": Int64}`` Struct
+* \(2) Output is an array of ``{"mode": input type, "count": Int64}`` Struct

Review comment:
       Done




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#issuecomment-725320345


   https://issues.apache.org/jira/browse/ARROW-10021


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521968240



##########
File path: docs/source/cpp/compute.rst
##########
@@ -140,7 +140,7 @@ Aggregations
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
 | min_max                  | Unary      | Numeric            | Scalar Struct  (1)    | :struct:`MinMaxOptions`                    |
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
-| mode                     | Unary      | Numeric            | Scalar Struct  (2)    |                                            |
+| mode                     | Unary      | Numeric            | Scalar Struct  (2)    | :struct:`ModeOptions`                      |

Review comment:
       And `mode` should be changed to "Struct".




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#issuecomment-725891740


   Python unit test failed. I'm updating mode kernel python binding per cpp changes.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521776788



##########
File path: docs/source/cpp/compute.rst
##########
@@ -140,7 +140,7 @@ Aggregations
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
 | min_max                  | Unary      | Numeric            | Scalar Struct  (1)    | :struct:`MinMaxOptions`                    |
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
-| mode                     | Unary      | Numeric            | Scalar Struct  (2)    |                                            |
+| mode                     | Unary      | Numeric            | Scalar Struct  (2)    | :struct:`ModeOptions`                      |

Review comment:
       Mode now returns an array of struct. Should it be "Struct Array"? or simply "Struct'.
   Should I also change above line (min_max) to "Struct"? or leave it "Scalar Struct".




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521865103



##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -76,6 +76,18 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
   enum Mode null_handling;
 };
 
+/// \brief Control Mode kernel behavior
+///
+/// Returns top-n common values and counts.
+/// By default, returns the most common value and count.
+struct ARROW_EXPORT ModeOptions : public FunctionOptions {
+  explicit ModeOptions(int n = 1) : n(n) {}

Review comment:
       Done




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou closed pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
pitrou closed pull request #8637:
URL: https://github.com/apache/arrow/pull/8637


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] cyb70289 commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
cyb70289 commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521973124



##########
File path: docs/source/cpp/compute.rst
##########
@@ -140,7 +140,7 @@ Aggregations
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
 | min_max                  | Unary      | Numeric            | Scalar Struct  (1)    | :struct:`MinMaxOptions`                    |
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
-| mode                     | Unary      | Numeric            | Scalar Struct  (2)    |                                            |
+| mode                     | Unary      | Numeric            | Scalar Struct  (2)    | :struct:`ModeOptions`                      |

Review comment:
       Thanks. Done.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] pitrou commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521255222



##########
File path: docs/source/cpp/compute.rst
##########
@@ -140,7 +140,7 @@ Aggregations
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
 | min_max                  | Unary      | Numeric            | Scalar Struct  (1)    | :struct:`MinMaxOptions`                    |
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
-| mode                     | Unary      | Numeric            | Scalar Struct  (2)    |                                            |
+| mode                     | Unary      | Numeric            | Scalar Struct  (2)    | :struct:`ModeOptions`                      |

Review comment:
       Replace "Scalar Struct" with "Struct".

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -735,71 +735,63 @@ class TestPrimitiveModeKernel : public ::testing::Test {
  public:
   using ArrowType = T;
   using Traits = TypeTraits<ArrowType>;
-  using c_type = typename ArrowType::c_type;
-  using ModeType = typename Traits::ScalarType;
-  using CountType = typename TypeTraits<Int64Type>::ScalarType;
-
-  void AssertModeIs(const Datum& array, c_type expected_mode, int64_t expected_count) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
+  using CType = typename ArrowType::c_type;
+
+  void AssertModesAre(const Datum& array, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n}));
+    const StructArray out_array(out.array());
+    ASSERT_EQ(out_array.length(), expected_modes.size());
+    ASSERT_EQ(out_array.num_fields(), 2);
+
+    const CType* out_modes = out_array.field(0)->data()->GetValues<CType>(1);
+    const int64_t* out_counts = out_array.field(1)->data()->GetValues<int64_t>(1);
+    for (int i = 0; i < out_array.length(); ++i) {
+      // equal or nan equal
+      ASSERT_TRUE(
+          (expected_modes[i] == out_modes[i]) ||
+          (expected_modes[i] != expected_modes[i] && out_modes[i] != out_modes[i]));
+      ASSERT_EQ(expected_counts[i], out_counts[i]);
+    }
+  }
 
-    const auto& out_mode = checked_cast<const ModeType&>(*value.value[0]);
-    ASSERT_EQ(expected_mode, out_mode.value);
+  void AssertModesAre(const std::string& json, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertModesAre(array, n, expected_modes, expected_counts);
+  }
 
-    const auto& out_count = checked_cast<const CountType&>(*value.value[1]);
-    ASSERT_EQ(expected_count, out_count.value);
+  void AssertModeIs(const Datum& array, CType expected_mode, int64_t expected_count) {
+    AssertModesAre(array, 1, {expected_mode}, {expected_count});
   }
 
-  void AssertModeIs(const std::string& json, c_type expected_mode,
+  void AssertModeIs(const std::string& json, CType expected_mode,
                     int64_t expected_count) {
     auto array = ArrayFromJSON(type_singleton(), json);
     AssertModeIs(array, expected_mode, expected_count);
   }
 
-  void AssertModeIs(const std::vector<std::string>& json, c_type expected_mode,
+  void AssertModeIs(const std::vector<std::string>& json, CType expected_mode,
                     int64_t expected_count) {
     auto chunked = ChunkedArrayFromJSON(type_singleton(), json);
     AssertModeIs(chunked, expected_mode, expected_count);
   }
 
-  void AssertModeIsNull(const Datum& array) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
-
-    for (const auto& val : value.value) {
-      ASSERT_FALSE(val->is_valid);
-    }
+  void AssertModeIsNull(const Datum& array, int n) {

Review comment:
       This should be renamed, since it's not null anymore. `AssertModesEmpty`?

##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -76,6 +76,18 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
   enum Mode null_handling;
 };
 
+/// \brief Control Mode kernel behavior
+///
+/// Returns top-n common values and counts.
+/// By default, returns the most common value and count.
+struct ARROW_EXPORT ModeOptions : public FunctionOptions {
+  explicit ModeOptions(int n = 1) : n(n) {}

Review comment:
       Can we use `int64_t` everywhere for this?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -735,71 +735,63 @@ class TestPrimitiveModeKernel : public ::testing::Test {
  public:
   using ArrowType = T;
   using Traits = TypeTraits<ArrowType>;
-  using c_type = typename ArrowType::c_type;
-  using ModeType = typename Traits::ScalarType;
-  using CountType = typename TypeTraits<Int64Type>::ScalarType;
-
-  void AssertModeIs(const Datum& array, c_type expected_mode, int64_t expected_count) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
+  using CType = typename ArrowType::c_type;
+
+  void AssertModesAre(const Datum& array, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n}));

Review comment:
       Can you call `ValidateFull` on `Mode` results in these test functions?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -205,35 +243,59 @@ struct ModeImpl : public ScalarAggregator {
     this->state.MergeFrom(std::move(other.state));
   }
 
-  void Finalize(KernelContext*, Datum* out) override {
-    using ModeType = typename TypeTraits<ArrowType>::ScalarType;
-    using CountType = typename TypeTraits<Int64Type>::ScalarType;
+  static std::shared_ptr<ArrayData> MakeArrayData(
+      const std::shared_ptr<DataType>& data_type, int64_t n) {
+    auto data = ArrayData::Make(data_type, n, 0);
+    data->buffers.resize(2);
+    data->buffers[0] = nullptr;
+    return data;
+  }
 
-    std::vector<std::shared_ptr<Scalar>> values;
-    auto mode_count = this->state.Finalize();
-    auto mode = mode_count.first;
-    auto count = mode_count.second;
-    if (count == 0) {
-      values = {std::make_shared<ModeType>(), std::make_shared<CountType>()};
-    } else {
-      values = {std::make_shared<ModeType>(mode), std::make_shared<CountType>(count)};
+  void Finalize(KernelContext* ctx, Datum* out) override {
+    const auto& mode_type = TypeTraits<ArrowType>::type_singleton();
+    const auto& count_type = int64();
+    const auto& out_type =
+        struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+
+    int n = this->options.n;
+    if (n > state.DistinctValues()) {
+      n = state.DistinctValues();
+    }
+    if (n <= 0) {
+      *out = Datum(StructArray(out_type, 0, ArrayVector{}).data());
+      return;
     }
-    out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
+
+    auto mode_data = this->MakeArrayData(mode_type, n);
+    auto count_data = this->MakeArrayData(count_type, n);
+    KERNEL_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx, ctx->Allocate(n * sizeof(CType)));
+    KERNEL_ASSIGN_OR_RAISE(count_data->buffers[1], ctx,
+                           ctx->Allocate(n * sizeof(int64_t)));
+
+    CType* mode_buffer = mode_data->template GetMutableValues<CType>(1);
+    int64_t* count_buffer = count_data->template GetMutableValues<int64_t>(1);
+    this->state.Finalize(mode_buffer, count_buffer, n);
+
+    *out = Datum(
+        StructArray(out_type, n, ArrayVector{MakeArray(mode_data), MakeArray(count_data)})

Review comment:
       You could perhaps construct the `ArrayData` directly instead of going through a temporary `StructArray`.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -153,7 +153,7 @@ Notes:
 
 * \(1) Output is a ``{"min": input type, "max": input type}`` Struct
 
-* \(2) Output is a ``{"mode": input type, "count": Int64}`` Struct
+* \(2) Output is an array of ``{"mode": input type, "count": Int64}`` Struct

Review comment:
       Explain a bit more what the array represents?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -28,6 +29,9 @@ namespace internal {
 
 namespace {
 
+constexpr char kModeFieldName[] = "modes";
+constexpr char kCountFieldName[] = "counts";

Review comment:
       Can you keep "mode", "count" as documented?
   

##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -205,35 +243,59 @@ struct ModeImpl : public ScalarAggregator {
     this->state.MergeFrom(std::move(other.state));
   }
 
-  void Finalize(KernelContext*, Datum* out) override {
-    using ModeType = typename TypeTraits<ArrowType>::ScalarType;
-    using CountType = typename TypeTraits<Int64Type>::ScalarType;
+  static std::shared_ptr<ArrayData> MakeArrayData(
+      const std::shared_ptr<DataType>& data_type, int64_t n) {
+    auto data = ArrayData::Make(data_type, n, 0);
+    data->buffers.resize(2);
+    data->buffers[0] = nullptr;
+    return data;
+  }
 
-    std::vector<std::shared_ptr<Scalar>> values;
-    auto mode_count = this->state.Finalize();
-    auto mode = mode_count.first;
-    auto count = mode_count.second;
-    if (count == 0) {
-      values = {std::make_shared<ModeType>(), std::make_shared<CountType>()};
-    } else {
-      values = {std::make_shared<ModeType>(mode), std::make_shared<CountType>(count)};
+  void Finalize(KernelContext* ctx, Datum* out) override {
+    const auto& mode_type = TypeTraits<ArrowType>::type_singleton();
+    const auto& count_type = int64();
+    const auto& out_type =
+        struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+
+    int n = this->options.n;
+    if (n > state.DistinctValues()) {
+      n = state.DistinctValues();
+    }
+    if (n <= 0) {
+      *out = Datum(StructArray(out_type, 0, ArrayVector{}).data());

Review comment:
       I'm not convinced this is correct. The struct array should have two child arrays, even if empty.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org