You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "bkietz (via GitHub)" <gi...@apache.org> on 2023/06/20 18:25:44 UTC

[GitHub] [arrow] bkietz commented on a diff in pull request #36020: GH-32190: [C++][Compute] Implement cumulative prod, max and min functions

bkietz commented on code in PR #36020:
URL: https://github.com/apache/arrow/pull/36020#discussion_r1235619772


##########
cpp/src/arrow/compute/function_internal.h:
##########
@@ -382,6 +377,12 @@ static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const Datum& value
   }
 }
 
+template <typename T>
+static inline Result<decltype(MakeScalar(std::declval<T>()))> GenericToScalar(
+    const std::optional<T>& value) {
+  return value.has_value() ? MakeScalar(value.value()) : std::make_shared<NullScalar>();
+}

Review Comment:
   This might be more readable with a trailing return type. Also, do you think we should include an overload which accepts std::nullopt?
   
   ```suggestion
   static inline Result<std::shared_ptr<Scalar>> GenericToScalar(std::nullopt_t) {
     return std::make_shared<NullScalar>();
   }
   
   template <typename T>
   static inline auto GenericToScalar(
       const std::optional<T>& value) -> Result<decltype(MakeScalar(value.value()))> {
     return value.has_value() ? MakeScalar(value.value()) : std::make_shared<NullScalar>();
   }
   ```



##########
cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc:
##########
@@ -160,18 +169,52 @@ const FunctionDoc cumulative_sum_doc{
     ("`values` must be numeric. Return an array/chunked array which is the\n"
      "cumulative sum computed over `values`. Results will wrap around on\n"
      "integer overflow. Use function \"cumulative_sum_checked\" if you want\n"
-     "overflow to return an error."),
+     "overflow to return an error. The default start is 0."),
     {"values"},
-    "CumulativeSumOptions"};
+    "CumulativeOptions"};
 
 const FunctionDoc cumulative_sum_checked_doc{
     "Compute the cumulative sum over a numeric input",
     ("`values` must be numeric. Return an array/chunked array which is the\n"
      "cumulative sum computed over `values`. This function returns an error\n"
      "on overflow. For a variant that doesn't fail on overflow, use\n"
-     "function \"cumulative_sum\"."),
+     "function \"cumulative_sum\". The default start is 0."),
+    {"values"},
+    "CumulativeOptions"};
+
+const FunctionDoc cumulative_prod_doc{
+    "Compute the cumulative product over a numeric input",
+    ("`values` must be numeric. Return an array/chunked array which is the\n"
+     "cumulative product computed over `values`. Results will wrap around on\n"
+     "integer overflow. Use function \"cumulative_prod_checked\" if you want\n"
+     "overflow to return an error. The default start is 1."),
+    {"values"},
+    "CumulativeOptions"};
+
+const FunctionDoc cumulative_prod_checked_doc{
+    "Compute the cumulative product over a numeric input",
+    ("`values` must be numeric. Return an array/chunked array which is the\n"
+     "cumulative product computed over `values`. This function returns an error\n"
+     "on overflow. For a variant that doesn't fail on overflow, use\n"
+     "function \"cumulative_prod\". The default start is 1."),
+    {"values"},
+    "CumulativeOptions"};
+
+const FunctionDoc cumulative_max_doc{
+    "Compute the cumulative max over a numeric input",
+    ("`values` must be numeric. Return an array/chunked array which is the\n"
+     "cumulative max computed over `values`. The default start is the minimum\n"
+     "value of input type."),

Review Comment:
   I think it'd be worthwhile to be even more explicit here
   ```suggestion
        "value of input type (so that any other value will replace the\n
        "start as the new maximum)."),
   ```
   (and similarly for cumulative min)



##########
cpp/src/arrow/compute/function_internal.h:
##########
@@ -510,6 +491,29 @@ static inline enable_if_same_result<T, Datum> GenericFromScalar(
   return Status::Invalid("Cannot deserialize Datum from ", value->ToString());
 }
 
+template <typename>
+constexpr bool is_optional_impl = false;
+template <typename T>
+constexpr bool is_optional_impl<std::optional<T>> = true;
+
+template <typename T>
+using is_optional =
+    std::integral_constant<bool, is_optional_impl<std::decay_t<T>> ||
+                                     std::is_same<T, std::nullopt_t>::value>;
+
+template <typename T, typename R = void>
+using enable_if_optional = enable_if_t<is_optional<T>::value, Result<T>>;
+
+template <typename T>
+static inline enable_if_optional<T> GenericFromScalar(

Review Comment:
   I think this doesn't need the `integral_constant` or the custom `enable_if_optional`, would you mind simplifying it?
   
   ```suggestion
   template <typename>
   constexpr bool is_optional_v = false;
   template <typename T>
   constexpr bool is_optional_v<std::optional<T>> = true;
   template <>
   constexpr bool is_optional_v<std::nullopt_t> = true;
   
   template <typename T>
   static inline std::enable_if_t<is_optional_v<T>, Result<T>> GenericFromScalar(
   ```



##########
python/pyarrow/_compute.pyx:
##########
@@ -1928,31 +1928,37 @@ class PartitionNthOptions(_PartitionNthOptions):
         self._set_options(pivot, null_placement)
 
 
-cdef class _CumulativeSumOptions(FunctionOptions):
+cdef class _CumulativeOptions(FunctionOptions):
     def _set_options(self, start, skip_nulls):
-        if not isinstance(start, Scalar):
+        if start is None:
+            self.wrapped.reset(new CCumulativeOptions(skip_nulls))
+        elif isinstance(start, Scalar):
+            self.wrapped.reset(new CCumulativeOptions(
+                pyarrow_unwrap_scalar(start), skip_nulls))
+        else:
             try:
                 start = lib.scalar(start)
+                self.wrapped.reset(new CCumulativeOptions(
+                    pyarrow_unwrap_scalar(start), skip_nulls))
             except Exception:
                 _raise_invalid_function_option(
                     start, "`start` type for CumulativeSumOptions", TypeError)
 
-        self.wrapped.reset(new CCumulativeSumOptions((<Scalar> start).unwrap(), skip_nulls))
-
 
-class CumulativeSumOptions(_CumulativeSumOptions):
+class CumulativeOptions(_CumulativeOptions):
     """
-    Options for `cumulative_sum` function.
+    Options for `cumulative` functions.

Review Comment:
   I think it'd be worthwhile to list these explicitly
   ```suggestion
       Options for `cumulative_*` functions.
       
       - cumulative_sum
       - cumulative_sum_checked
       ...
   ```



##########
python/pyarrow/compute.py:
##########
@@ -33,7 +33,7 @@
     AssumeTimezoneOptions,
     CastOptions,
     CountOptions,
-    CumulativeSumOptions,
+    CumulativeOptions,

Review Comment:
   @jorisvandenbossche Does this need an alias?
   ```suggestion
       CumulativeOptions,
       CumulativeOptions as CumulativeSumOptions,
   ```



##########
cpp/src/arrow/compute/function_internal.h:
##########
@@ -510,6 +491,29 @@ static inline enable_if_same_result<T, Datum> GenericFromScalar(
   return Status::Invalid("Cannot deserialize Datum from ", value->ToString());
 }
 
+template <typename>
+constexpr bool is_optional_impl = false;
+template <typename T>
+constexpr bool is_optional_impl<std::optional<T>> = true;
+
+template <typename T>
+using is_optional =
+    std::integral_constant<bool, is_optional_impl<std::decay_t<T>> ||
+                                     std::is_same<T, std::nullopt_t>::value>;
+
+template <typename T, typename R = void>
+using enable_if_optional = enable_if_t<is_optional<T>::value, Result<T>>;
+
+template <typename T>
+static inline enable_if_optional<T> GenericFromScalar(

Review Comment:
   That seems like a better default indeed



##########
cpp/src/arrow/compute/kernels/base_arithmetic_internal.h:
##########
@@ -60,6 +60,11 @@ struct Add {
   static enable_if_decimal_value<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
     return left + right;
   }
+
+  template <typename T>
+  static constexpr T Identity() {

Review Comment:
   This is great. Could you make this into a doccomment? Also, it might be more obviously part of the interface for these kernel ops if  you made it a trait:
   
   ```c++
   /// The term identity is from the mathematical notation monoid.
   /// For any associative binary operation, identity is defined as
   ///
   ///     Op(identity, x) = x for all x.
   template <typename Op>
   struct Identity;
   
   template <>
   struct Identity<Add> {
     template <typename Value>
     static constexpr Value value{0};
   };
   
   template <>
   struct Identity<AddChecked> : Identity<Add> {};
   
   // ...
   ```
   
   Then the value of the identity can be referenced with `Identity<Add>::value<int32_t>` or so



##########
cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc:
##########
@@ -49,15 +50,12 @@ struct CumulativeOptionsWrapper : public OptionsWrapper<OptionsType> {
     }
 
     const auto& start = options->start;
-    if (!start || !start->is_valid) {
-      return Status::Invalid("Cumulative `start` option must be non-null and valid");
-    }
 
-    // Ensure `start` option matches input type
-    if (!start->type->Equals(*args.inputs[0])) {
-      ARROW_ASSIGN_OR_RAISE(
-          auto casted_start,
-          Cast(Datum(start), args.inputs[0], CastOptions::Safe(), ctx->exec_context()));
+    // Ensure `start` option, if given, matches input type,

Review Comment:
   ```suggestion
       // Ensure `start` option, if given, matches input type
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org