You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/11 10:30:47 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #8637: ARROW-10021: [C++][Compute] Return top-n modes in mode kernel

pitrou commented on a change in pull request #8637:
URL: https://github.com/apache/arrow/pull/8637#discussion_r521255222



##########
File path: docs/source/cpp/compute.rst
##########
@@ -140,7 +140,7 @@ Aggregations
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
 | min_max                  | Unary      | Numeric            | Scalar Struct  (1)    | :struct:`MinMaxOptions`                    |
 +--------------------------+------------+--------------------+-----------------------+--------------------------------------------+
-| mode                     | Unary      | Numeric            | Scalar Struct  (2)    |                                            |
+| mode                     | Unary      | Numeric            | Scalar Struct  (2)    | :struct:`ModeOptions`                      |

Review comment:
       Replace "Scalar Struct" with "Struct".

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -735,71 +735,63 @@ class TestPrimitiveModeKernel : public ::testing::Test {
  public:
   using ArrowType = T;
   using Traits = TypeTraits<ArrowType>;
-  using c_type = typename ArrowType::c_type;
-  using ModeType = typename Traits::ScalarType;
-  using CountType = typename TypeTraits<Int64Type>::ScalarType;
-
-  void AssertModeIs(const Datum& array, c_type expected_mode, int64_t expected_count) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
+  using CType = typename ArrowType::c_type;
+
+  void AssertModesAre(const Datum& array, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n}));
+    const StructArray out_array(out.array());
+    ASSERT_EQ(out_array.length(), expected_modes.size());
+    ASSERT_EQ(out_array.num_fields(), 2);
+
+    const CType* out_modes = out_array.field(0)->data()->GetValues<CType>(1);
+    const int64_t* out_counts = out_array.field(1)->data()->GetValues<int64_t>(1);
+    for (int i = 0; i < out_array.length(); ++i) {
+      // equal or nan equal
+      ASSERT_TRUE(
+          (expected_modes[i] == out_modes[i]) ||
+          (expected_modes[i] != expected_modes[i] && out_modes[i] != out_modes[i]));
+      ASSERT_EQ(expected_counts[i], out_counts[i]);
+    }
+  }
 
-    const auto& out_mode = checked_cast<const ModeType&>(*value.value[0]);
-    ASSERT_EQ(expected_mode, out_mode.value);
+  void AssertModesAre(const std::string& json, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    auto array = ArrayFromJSON(type_singleton(), json);
+    AssertModesAre(array, n, expected_modes, expected_counts);
+  }
 
-    const auto& out_count = checked_cast<const CountType&>(*value.value[1]);
-    ASSERT_EQ(expected_count, out_count.value);
+  void AssertModeIs(const Datum& array, CType expected_mode, int64_t expected_count) {
+    AssertModesAre(array, 1, {expected_mode}, {expected_count});
   }
 
-  void AssertModeIs(const std::string& json, c_type expected_mode,
+  void AssertModeIs(const std::string& json, CType expected_mode,
                     int64_t expected_count) {
     auto array = ArrayFromJSON(type_singleton(), json);
     AssertModeIs(array, expected_mode, expected_count);
   }
 
-  void AssertModeIs(const std::vector<std::string>& json, c_type expected_mode,
+  void AssertModeIs(const std::vector<std::string>& json, CType expected_mode,
                     int64_t expected_count) {
     auto chunked = ChunkedArrayFromJSON(type_singleton(), json);
     AssertModeIs(chunked, expected_mode, expected_count);
   }
 
-  void AssertModeIsNull(const Datum& array) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
-
-    for (const auto& val : value.value) {
-      ASSERT_FALSE(val->is_valid);
-    }
+  void AssertModeIsNull(const Datum& array, int n) {

Review comment:
       This should be renamed, since it's not null anymore. `AssertModesEmpty`?

##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -76,6 +76,18 @@ struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
   enum Mode null_handling;
 };
 
+/// \brief Control Mode kernel behavior
+///
+/// Returns top-n common values and counts.
+/// By default, returns the most common value and count.
+struct ARROW_EXPORT ModeOptions : public FunctionOptions {
+  explicit ModeOptions(int n = 1) : n(n) {}

Review comment:
       Can we use `int64_t` everywhere for this?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -735,71 +735,63 @@ class TestPrimitiveModeKernel : public ::testing::Test {
  public:
   using ArrowType = T;
   using Traits = TypeTraits<ArrowType>;
-  using c_type = typename ArrowType::c_type;
-  using ModeType = typename Traits::ScalarType;
-  using CountType = typename TypeTraits<Int64Type>::ScalarType;
-
-  void AssertModeIs(const Datum& array, c_type expected_mode, int64_t expected_count) {
-    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array));
-    const StructScalar& value = out.scalar_as<StructScalar>();
+  using CType = typename ArrowType::c_type;
+
+  void AssertModesAre(const Datum& array, const int n,
+                      const std::vector<CType>& expected_modes,
+                      const std::vector<int64_t>& expected_counts) {
+    ASSERT_OK_AND_ASSIGN(Datum out, Mode(array, ModeOptions{n}));

Review comment:
       Can you call `ValidateFull` on `Mode` results in these test functions?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -205,35 +243,59 @@ struct ModeImpl : public ScalarAggregator {
     this->state.MergeFrom(std::move(other.state));
   }
 
-  void Finalize(KernelContext*, Datum* out) override {
-    using ModeType = typename TypeTraits<ArrowType>::ScalarType;
-    using CountType = typename TypeTraits<Int64Type>::ScalarType;
+  static std::shared_ptr<ArrayData> MakeArrayData(
+      const std::shared_ptr<DataType>& data_type, int64_t n) {
+    auto data = ArrayData::Make(data_type, n, 0);
+    data->buffers.resize(2);
+    data->buffers[0] = nullptr;
+    return data;
+  }
 
-    std::vector<std::shared_ptr<Scalar>> values;
-    auto mode_count = this->state.Finalize();
-    auto mode = mode_count.first;
-    auto count = mode_count.second;
-    if (count == 0) {
-      values = {std::make_shared<ModeType>(), std::make_shared<CountType>()};
-    } else {
-      values = {std::make_shared<ModeType>(mode), std::make_shared<CountType>(count)};
+  void Finalize(KernelContext* ctx, Datum* out) override {
+    const auto& mode_type = TypeTraits<ArrowType>::type_singleton();
+    const auto& count_type = int64();
+    const auto& out_type =
+        struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+
+    int n = this->options.n;
+    if (n > state.DistinctValues()) {
+      n = state.DistinctValues();
+    }
+    if (n <= 0) {
+      *out = Datum(StructArray(out_type, 0, ArrayVector{}).data());
+      return;
     }
-    out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
+
+    auto mode_data = this->MakeArrayData(mode_type, n);
+    auto count_data = this->MakeArrayData(count_type, n);
+    KERNEL_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx, ctx->Allocate(n * sizeof(CType)));
+    KERNEL_ASSIGN_OR_RAISE(count_data->buffers[1], ctx,
+                           ctx->Allocate(n * sizeof(int64_t)));
+
+    CType* mode_buffer = mode_data->template GetMutableValues<CType>(1);
+    int64_t* count_buffer = count_data->template GetMutableValues<int64_t>(1);
+    this->state.Finalize(mode_buffer, count_buffer, n);
+
+    *out = Datum(
+        StructArray(out_type, n, ArrayVector{MakeArray(mode_data), MakeArray(count_data)})

Review comment:
       You could perhaps construct the `ArrayData` directly instead of going through a temporary `StructArray`.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -153,7 +153,7 @@ Notes:
 
 * \(1) Output is a ``{"min": input type, "max": input type}`` Struct
 
-* \(2) Output is a ``{"mode": input type, "count": Int64}`` Struct
+* \(2) Output is an array of ``{"mode": input type, "count": Int64}`` Struct

Review comment:
       Explain a bit more what the array represents?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -28,6 +29,9 @@ namespace internal {
 
 namespace {
 
+constexpr char kModeFieldName[] = "modes";
+constexpr char kCountFieldName[] = "counts";

Review comment:
       Can you keep "mode", "count" as documented?
   

##########
File path: cpp/src/arrow/compute/kernels/aggregate_mode.cc
##########
@@ -205,35 +243,59 @@ struct ModeImpl : public ScalarAggregator {
     this->state.MergeFrom(std::move(other.state));
   }
 
-  void Finalize(KernelContext*, Datum* out) override {
-    using ModeType = typename TypeTraits<ArrowType>::ScalarType;
-    using CountType = typename TypeTraits<Int64Type>::ScalarType;
+  static std::shared_ptr<ArrayData> MakeArrayData(
+      const std::shared_ptr<DataType>& data_type, int64_t n) {
+    auto data = ArrayData::Make(data_type, n, 0);
+    data->buffers.resize(2);
+    data->buffers[0] = nullptr;
+    return data;
+  }
 
-    std::vector<std::shared_ptr<Scalar>> values;
-    auto mode_count = this->state.Finalize();
-    auto mode = mode_count.first;
-    auto count = mode_count.second;
-    if (count == 0) {
-      values = {std::make_shared<ModeType>(), std::make_shared<CountType>()};
-    } else {
-      values = {std::make_shared<ModeType>(mode), std::make_shared<CountType>(count)};
+  void Finalize(KernelContext* ctx, Datum* out) override {
+    const auto& mode_type = TypeTraits<ArrowType>::type_singleton();
+    const auto& count_type = int64();
+    const auto& out_type =
+        struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+
+    int n = this->options.n;
+    if (n > state.DistinctValues()) {
+      n = state.DistinctValues();
+    }
+    if (n <= 0) {
+      *out = Datum(StructArray(out_type, 0, ArrayVector{}).data());

Review comment:
       I'm not convinced this is correct. The struct array should have two child arrays, even if empty.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org