You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/08/30 11:01:59 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #11011: ARROW-13764: [C++] Support CountOptions in grouped count distinct

pitrou commented on a change in pull request #11011:
URL: https://github.com/apache/arrow/pull/11011#discussion_r698385134



##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -2013,8 +2015,21 @@ struct GroupedCountDistinctImpl : public GroupedAggregator {
 
     ARROW_ASSIGN_OR_RAISE(auto uniques, grouper_->GetUniques());
     auto* g = uniques[1].array()->GetValues<uint32_t>(1);
-    for (int64_t i = 0; i < uniques.length; i++) {
-      counts[g[i]]++;
+    const auto& items = *uniques[0].array();
+    const auto* valid = items.GetValues<uint8_t>(0, 0);
+    if (options_.mode == CountOptions::ALL ||
+        (options_.mode == CountOptions::ONLY_VALID && !valid)) {
+      for (int64_t i = 0; i < uniques.length; i++) {
+        counts[g[i]]++;
+      }
+    } else if (options_.mode == CountOptions::ONLY_VALID) {
+      for (int64_t i = 0; i < uniques.length; i++) {
+        counts[g[i]] += BitUtil::GetBit(valid, items.offset + i);
+      }
+    } else {  // ONLY_NULL
+      for (int64_t i = 0; i < uniques.length; i++) {
+        counts[g[i]] += !BitUtil::GetBit(valid, items.offset + i);

Review comment:
       I think `valid` might be null here.

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -2036,7 +2052,50 @@ struct GroupedDistinctImpl : public GroupedCountDistinctImpl {
     ARROW_ASSIGN_OR_RAISE(auto groupings, grouper_->MakeGroupings(
                                               *uniques[1].array_as<UInt32Array>(),
                                               static_cast<uint32_t>(num_groups_), ctx_));
-    return grouper_->ApplyGroupings(*groupings, *uniques[0].make_array(), ctx_);
+    ARROW_ASSIGN_OR_RAISE(
+        auto list, grouper_->ApplyGroupings(*groupings, *uniques[0].make_array(), ctx_));
+    auto values = list->values();
+    int32_t* offsets = reinterpret_cast<int32_t*>(list->value_offsets()->mutable_data());
+    if (options_.mode == CountOptions::ALL ||
+        (options_.mode == CountOptions::ONLY_VALID && values->null_count() == 0)) {
+      return list;
+    } else if (options_.mode == CountOptions::ONLY_VALID) {
+      int32_t prev_offset = offsets[0];
+      for (int64_t i = 0; i < list->length(); i++) {
+        const int32_t slot_length = offsets[i + 1] - prev_offset;
+        const int64_t null_count =
+            slot_length - arrow::internal::CountSetBits(values->null_bitmap()->data(),
+                                                        prev_offset, slot_length);

Review comment:
       Might add `DCHECK_LE(null_count, 1)` here?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -2036,7 +2052,50 @@ struct GroupedDistinctImpl : public GroupedCountDistinctImpl {
     ARROW_ASSIGN_OR_RAISE(auto groupings, grouper_->MakeGroupings(
                                               *uniques[1].array_as<UInt32Array>(),
                                               static_cast<uint32_t>(num_groups_), ctx_));
-    return grouper_->ApplyGroupings(*groupings, *uniques[0].make_array(), ctx_);
+    ARROW_ASSIGN_OR_RAISE(
+        auto list, grouper_->ApplyGroupings(*groupings, *uniques[0].make_array(), ctx_));
+    auto values = list->values();

Review comment:
       Perhaps `DCHECK_EQ(values->offset(), 0)` since it is relied upon below?

##########
File path: docs/source/cpp/compute.rst
##########
@@ -289,36 +289,42 @@ The supported aggregation functions are as follows. All function names are
 prefixed with ``hash_``, which differentiates them from their scalar
 equivalents above and reflects how they are implemented internally.
 
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| Function name | Arity | Input types | Output type     | Options class                    | Notes |
-+===============+=======+=============+=================+==================================+=======+
-| hash_all      | Unary | Boolean     | Boolean         | :struct:`ScalarAggregateOptions` | \(1)  |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_any      | Unary | Boolean     | Boolean         | :struct:`ScalarAggregateOptions` | \(1)  |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_count    | Unary | Any         | Int64           | :struct:`CountOptions`           | \(2)  |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_mean     | Unary | Numeric     | Decimal/Float64 | :struct:`ScalarAggregateOptions` |       |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_min_max  | Unary | Numeric     | Struct          | :struct:`ScalarAggregateOptions` | \(3)  |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_product  | Unary | Numeric     | Numeric         | :struct:`ScalarAggregateOptions` | \(4)  |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_stddev   | Unary | Numeric     | Float64         | :struct:`VarianceOptions`        |       |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_sum      | Unary | Numeric     | Numeric         | :struct:`ScalarAggregateOptions` | \(4)  |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_tdigest  | Unary | Numeric     | Float64         | :struct:`TDigestOptions`         | \(5)  |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
-| hash_variance | Unary | Numeric     | Float64         | :struct:`VarianceOptions`        |       |
-+---------------+-------+-------------+-----------------+----------------------------------+-------+
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| Function name       | Arity | Input types | Output type     | Options class                    | Notes |
++=====================+=======+=============+=================+==================================+=======+
+| hash_all            | Unary | Boolean     | Boolean         | :struct:`ScalarAggregateOptions` | \(1)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_any            | Unary | Boolean     | Boolean         | :struct:`ScalarAggregateOptions` | \(1)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_count          | Unary | Any         | Int64           | :struct:`CountOptions`           | \(2)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_count_distinct | Unary | Any         | Int64           | :struct:`CountOptions`           | \(2)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_distinct       | Unary | Any         | Input type      | :struct:`CountOptions`           | \(2)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_mean           | Unary | Numeric     | Decimal/Float64 | :struct:`ScalarAggregateOptions` |       |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_min_max        | Unary | Numeric     | Struct          | :struct:`ScalarAggregateOptions` | \(3)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_product        | Unary | Numeric     | Numeric         | :struct:`ScalarAggregateOptions` | \(4)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_stddev         | Unary | Numeric     | Float64         | :struct:`VarianceOptions`        |       |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_sum            | Unary | Numeric     | Numeric         | :struct:`ScalarAggregateOptions` | \(4)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_tdigest        | Unary | Numeric     | Float64         | :struct:`TDigestOptions`         | \(5)  |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
+| hash_variance       | Unary | Numeric     | Float64         | :struct:`VarianceOptions`        |       |
++---------------------+-------+-------------+-----------------+----------------------------------+-------+
 
 * \(1) If null values are taken into account, by setting the
   :member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_
   logic is applied. The min_count option is not respected.
 
-* \(2) CountMode controls whether only non-null values are counted (the
-  default), only null values are counted, or all values are counted.
+* \(2) CountMode controls whether only non-null values are counted
+  (the default), only null values are counted, or all values are
+  counted. For hash_distinct, it instead controls whether null values
+  are emitted.

Review comment:
       Also add that the setting doesn't apply to the grouping keys?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate.cc
##########
@@ -2036,7 +2052,50 @@ struct GroupedDistinctImpl : public GroupedCountDistinctImpl {
     ARROW_ASSIGN_OR_RAISE(auto groupings, grouper_->MakeGroupings(
                                               *uniques[1].array_as<UInt32Array>(),
                                               static_cast<uint32_t>(num_groups_), ctx_));
-    return grouper_->ApplyGroupings(*groupings, *uniques[0].make_array(), ctx_);
+    ARROW_ASSIGN_OR_RAISE(
+        auto list, grouper_->ApplyGroupings(*groupings, *uniques[0].make_array(), ctx_));
+    auto values = list->values();
+    int32_t* offsets = reinterpret_cast<int32_t*>(list->value_offsets()->mutable_data());
+    if (options_.mode == CountOptions::ALL ||
+        (options_.mode == CountOptions::ONLY_VALID && values->null_count() == 0)) {
+      return list;
+    } else if (options_.mode == CountOptions::ONLY_VALID) {
+      int32_t prev_offset = offsets[0];
+      for (int64_t i = 0; i < list->length(); i++) {
+        const int32_t slot_length = offsets[i + 1] - prev_offset;
+        const int64_t null_count =
+            slot_length - arrow::internal::CountSetBits(values->null_bitmap()->data(),
+                                                        prev_offset, slot_length);
+        const int32_t offset = null_count > 0 ? slot_length - 1 : slot_length;
+        prev_offset = offsets[i + 1];
+        offsets[i + 1] = offsets[i] + offset;
+      }
+      auto filter =
+          std::make_shared<BooleanArray>(values->length(), values->null_bitmap());
+      ARROW_ASSIGN_OR_RAISE(
+          auto new_values,
+          Filter(std::move(values), filter, FilterOptions(FilterOptions::DROP), ctx_));
+      return std::make_shared<ListArray>(list->type(), list->length(),
+                                         list->value_offsets(), new_values.make_array());
+    }
+    // ONLY_NULL
+    int32_t prev_offset = offsets[0];
+    for (int64_t i = 0; i < list->length(); i++) {
+      const int32_t slot_length = offsets[i + 1] - prev_offset;
+      const int64_t null_count =
+          slot_length - arrow::internal::CountSetBits(values->null_bitmap()->data(),

Review comment:
       `values->null_bitmap()` may be null here, I think.

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
##########
@@ -1327,33 +1347,43 @@ TEST(GroupBy, CountDistinct) {
                          internal::GroupBy(
                              {
                                  table->GetColumnByName("argument"),
+                                 table->GetColumnByName("argument"),
+                                 table->GetColumnByName("argument"),
                              },
                              {
                                  table->GetColumnByName("key"),
                              },
                              {
-                                 {"hash_count_distinct", nullptr},
+                                 {"hash_count_distinct", &all},
+                                 {"hash_count_distinct", &only_valid},
+                                 {"hash_count_distinct", &only_null},
                              },
                              use_threads));
     ValidateOutput(aggregated_and_grouped);
     SortBy({"key_0"}, &aggregated_and_grouped);
 
     AssertDatumsEqual(ArrayFromJSON(struct_({
+                                        field("hash_count_distinct", int64()),
+                                        field("hash_count_distinct", int64()),
                                         field("hash_count_distinct", int64()),
                                         field("key_0", int64()),
                                     }),
                                     R"([
-    [1, 1],
-    [2, 2],
-    [3, 3],
-    [4, null]
+    [1, 1, 0, 1],
+    [2, 2, 0, 2],
+    [3, 2, 1, 3],
+    [1, 0, 1, 4],
+    [4, 4, 0, null]
   ])"),

Review comment:
       Perhaps add a test case where none of the values is null?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org