You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/04/15 08:22:51 UTC

[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #9758: ARROW-9054: [C++] Add ScalarAggregateOptions

jorisvandenbossche commented on a change in pull request #9758:
URL: https://github.com/apache/arrow/pull/9758#discussion_r613850242



##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -37,43 +37,25 @@ class ExecContext;
 // ----------------------------------------------------------------------
 // Aggregate functions
 
-/// \addtogroup compute-concrete-options
-/// @{
-
-/// \brief Control Count kernel behavior
-///
-/// By default, all non-null values are counted.
-struct ARROW_EXPORT CountOptions : public FunctionOptions {
-  enum Mode {
-    /// Count all non-null values.
-    COUNT_NON_NULL = 0,
-    /// Count all null values.
-    COUNT_NULL,
-  };
-
-  explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {}
-
-  static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); }
-
-  enum Mode count_mode;
-};
-
-/// \brief Control MinMax kernel behavior
+/// \brief Control general scalar aggregate kernel behavior
 ///
 /// By default, null values are ignored
-struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
+struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
   enum Mode {
-    /// Skip null values
-    SKIP = 0,
-    /// Any nulls will result in null output
-    EMIT_NULL
+    /// Skip null values.
+    SKIPNA = 0,

Review comment:
       Can you use the "NULL" terminilogy instead of "NA" (since "null" is what we generally use within Arrow), eg `SKIP_NULL`

##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -37,43 +37,25 @@ class ExecContext;
 // ----------------------------------------------------------------------
 // Aggregate functions
 
-/// \addtogroup compute-concrete-options
-/// @{
-
-/// \brief Control Count kernel behavior
-///
-/// By default, all non-null values are counted.
-struct ARROW_EXPORT CountOptions : public FunctionOptions {
-  enum Mode {
-    /// Count all non-null values.
-    COUNT_NON_NULL = 0,
-    /// Count all null values.
-    COUNT_NULL,
-  };
-
-  explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {}
-
-  static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); }
-
-  enum Mode count_mode;
-};
-
-/// \brief Control MinMax kernel behavior
+/// \brief Control general scalar aggregate kernel behavior
 ///
 /// By default, null values are ignored
-struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
+struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
   enum Mode {

Review comment:
       Could this "Mode" get a more descriptive name? As I understand it, this is specifically about handling nulls?

##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -37,43 +37,25 @@ class ExecContext;
 // ----------------------------------------------------------------------
 // Aggregate functions
 
-/// \addtogroup compute-concrete-options
-/// @{
-
-/// \brief Control Count kernel behavior
-///
-/// By default, all non-null values are counted.
-struct ARROW_EXPORT CountOptions : public FunctionOptions {
-  enum Mode {
-    /// Count all non-null values.
-    COUNT_NON_NULL = 0,
-    /// Count all null values.
-    COUNT_NULL,
-  };
-
-  explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {}
-
-  static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); }
-
-  enum Mode count_mode;
-};
-
-/// \brief Control MinMax kernel behavior
+/// \brief Control general scalar aggregate kernel behavior
 ///
 /// By default, null values are ignored
-struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
+struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
   enum Mode {

Review comment:
       (ah, I see that the field itself below is called "null_handling", that's more descriptive)

##########
File path: cpp/src/arrow/compute/api_aggregate.h
##########
@@ -37,43 +37,25 @@ class ExecContext;
 // ----------------------------------------------------------------------
 // Aggregate functions
 
-/// \addtogroup compute-concrete-options
-/// @{
-
-/// \brief Control Count kernel behavior
-///
-/// By default, all non-null values are counted.
-struct ARROW_EXPORT CountOptions : public FunctionOptions {
-  enum Mode {
-    /// Count all non-null values.
-    COUNT_NON_NULL = 0,
-    /// Count all null values.
-    COUNT_NULL,
-  };
-
-  explicit CountOptions(enum Mode count_mode = COUNT_NON_NULL) : count_mode(count_mode) {}
-
-  static CountOptions Defaults() { return CountOptions(COUNT_NON_NULL); }
-
-  enum Mode count_mode;
-};
-
-/// \brief Control MinMax kernel behavior
+/// \brief Control general scalar aggregate kernel behavior
 ///
 /// By default, null values are ignored
-struct ARROW_EXPORT MinMaxOptions : public FunctionOptions {
+struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
   enum Mode {
-    /// Skip null values
-    SKIP = 0,
-    /// Any nulls will result in null output
-    EMIT_NULL
+    /// Skip null values.
+    SKIPNA = 0,
+    /// Calculate over all values.
+    KEEPNA,

Review comment:
       Do we think there will come more possible options in the future? Because if not, this null_handling mode could also be a boolean skip_nulls=True/False ?

##########
File path: cpp/src/arrow/compute/kernels/aggregate_test.cc
##########
@@ -399,6 +498,13 @@ TYPED_TEST_SUITE(TestMeanKernelNumeric, NumericArrowTypes);
 TYPED_TEST(TestMeanKernelNumeric, SimpleMean) {
   using ScalarType = typename TypeTraits<DoubleType>::ScalarType;
 
+  const ScalarAggregateOptions& options =
+      ScalarAggregateOptions(ScalarAggregateOptions::SKIPNA, 0);
+
+  ValidateMean<TypeParam>("[]", Datum(std::make_shared<ScalarType>(0)), options);

Review comment:
       Just to make sure I am reading this one correctly, is it basically testing `mean([], min_count=0) == 0` ?
   
   I am not sure that should give 0 as result? I would have expected either NaN or Null. 
   Basically mean of empty is dividing 0/0, which is NaN. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org