You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/04/29 03:20:41 UTC

[GitHub] [arrow] cyb70289 commented on a change in pull request #9758: ARROW-9054: [C++] Add ScalarAggregateOptions

cyb70289 commented on a change in pull request #9758:
URL: https://github.com/apache/arrow/pull/9758#discussion_r622698623



##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -72,44 +73,54 @@ struct SumImpl : public ScalarAggregator {
 
   Status MergeFrom(KernelContext*, KernelState&& src) override {
     const auto& other = checked_cast<const ThisType&>(src);
+    this->length += other.length;
     this->count += other.count;
     this->sum += other.sum;
     return Status::OK();
   }
 
   Status Finalize(KernelContext*, Datum* out) override {
-    if (this->count == 0) {
+    if (this->count < options.min_count) {
       out->value = std::make_shared<OutputType>();
     } else {
       out->value = MakeScalar(this->sum);
     }
     return Status::OK();
   }
 
+  size_t length = 0;
   size_t count = 0;
   typename SumType::c_type sum = 0;
+  ScalarAggregateOptions options;
 };
 
 template <typename ArrowType, SimdLevel::type SimdLevel>
 struct MeanImpl : public SumImpl<ArrowType, SimdLevel> {
   Status Finalize(KernelContext*, Datum* out) override {
-    if (this->count == 0) {
+    if (this->count < options.min_count) {
       out->value = std::make_shared<DoubleScalar>();
-    } else {
+    } else if (options.skip_nulls) {
       const double mean = static_cast<double>(this->sum) / this->count;
       out->value = std::make_shared<DoubleScalar>(mean);
+    } else {
+      const double mean = static_cast<double>(this->sum) / this->length;

Review comment:
       Does this mean nulls are treated as 0? IMHO that is not the desired behaviour, and it may give misleading results.
   
   We can simply return an empty scalar if there are nulls and the user doesn't want to skip them, similar to pandas:
   ```
   In [1]: import pandas as pd
   
   In [2]: a = pd.Series([1,2,None])
   
   In [3]: a.sum(skipna=True)
   Out[3]: 3.0
   
   In [4]: a.sum(skipna=False)
   Out[4]: nan
   
   In [5]: a.mean(skipna=True)
   Out[5]: 1.5
   
   In [6]: a.mean(skipna=False)
   Out[6]: nan
   
   ```

##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -1092,11 +1092,8 @@ def test_strptime():
 def test_count():
     arr = pa.array([1, 2, 3, None, None])
     assert pc.count(arr).as_py() == 3
-    assert pc.count(arr, count_mode='count_non_null').as_py() == 3
-    assert pc.count(arr, count_mode='count_null').as_py() == 2
-
-    with pytest.raises(ValueError, match="'zzz' is not a valid count_mode"):
-        pc.count(arr, count_mode='zzz')

Review comment:
       Why remove this?

##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -207,7 +207,7 @@ def test_sum_array(arrow_type):
     assert pc.sum(arr).as_py() == 10

Review comment:
       Also test arrays with Nones?

##########
File path: cpp/src/arrow/compute/kernels/hash_aggregate_test.cc
##########
@@ -659,18 +659,17 @@ TEST(GroupBy, ConcreteCaseWithValidateGroupBy) {
     [null,  "gama"]
   ])");
 
-  CountOptions count_non_null{CountOptions::COUNT_NON_NULL},
-      count_null{CountOptions::COUNT_NULL};
-
-  MinMaxOptions emit_null{MinMaxOptions::EMIT_NULL};
+  ScalarAggregateOptions keepna{false, 1};
+  ScalarAggregateOptions skipna{true, 1};
+  ScalarAggregateOptions other{true, 1};
 
   using internal::Aggregate;
   for (auto agg : {
-           Aggregate{"hash_sum", nullptr},

Review comment:
       Why is `nullptr` not accepted now?

##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -229,7 +229,7 @@ def test_sum_chunked_array(arrow_type):
 
     arr = pa.chunked_array((), type=arrow_type)
     assert arr.num_chunks == 0
-    assert pc.sum(arr).as_py() is None  # noqa: E711
+    assert pc.sum(arr).as_py() == 0

Review comment:
       Summing an empty array to 0 is a bit surprising to me, but it looks like pandas does the same thing.

##########
File path: cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
##########
@@ -256,8 +268,10 @@ struct MinMaxImpl : public ScalarAggregator {
     using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
 
     std::vector<std::shared_ptr<Scalar>> values;
-    if (!state.has_values ||
-        (state.has_nulls && options.null_handling == MinMaxOptions::EMIT_NULL)) {
+    if (!state.has_values && options.skip_nulls && options.min_count == 0) {
+      values = {std::make_shared<ScalarType>(state.min),
+                std::make_shared<ScalarType>(state.max)};

Review comment:
       The code in this `if` clause looks unnecessary, as it is exactly the same as the code in the last `else`.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org