You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/12/18 16:43:52 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #11990: ARROW-15032 Add DateStruct Function

lidavidm commented on a change in pull request #11990:
URL: https://github.com/apache/arrow/pull/11990#discussion_r771842555



##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -1024,6 +1024,18 @@ Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR);
 ARROW_EXPORT
 Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR);
 
+/// \brief DateStruct returns a struct containing the Year, Month and Day value for
+/// each element of `values`.
+///
+/// \param[in] values input to extract (year, month, day) struct from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 7.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> DateStruct(const Datum& values, ExecContext* ctx = NULLPTR);

Review comment:
       `YearMonthDay`/`year_month_day` might be a little more consistent with the naming here?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_test.cc
##########
@@ -580,6 +627,7 @@ TEST_F(ScalarTemporalTest, TestZoned2) {
     CheckScalarUnary("year", unit, times_seconds_precision, int64(), year);
     CheckScalarUnary("month", unit, times_seconds_precision, int64(), month);
     CheckScalarUnary("day", unit, times_seconds_precision, int64(), day);
+    // TODO: where is year assigned in this function?

Review comment:
       TEST_F declares the test as using a [test fixture in Googletest](https://google.github.io/googletest/primer.html#same-data-multiple-tests) so it effectively becomes a method of the fixture class above. Hence `year` comes from here: https://github.com/apache/arrow/blob/b5a47e166c7c168c3393256d4da48bf7b7f6add5/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc#L141

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -185,6 +191,98 @@ struct Day {
   Localizer localizer_;
 };
 
+// ----------------------------------------------------------------------
+// Extract (year, month, day) struct from temporal types
+
+template <typename Duration, typename InType, typename BuilderType>
+struct DateStructVisitValueFunction {
+  static Result<std::function<Status(typename InType::c_type arg)>> Get(
+      const std::vector<BuilderType*>& field_builders, const ArrayData&,
+      StructBuilder* struct_builder) {
+    return [=](typename InType::c_type arg) {
+      const auto ymd = year_month_day(floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(arg)));;
+      field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+      field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+      field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+      return struct_builder->Append();
+    };
+  };
+};  
+
+template <typename Duration, typename BuilderType>
+struct DateStructVisitValueFunction<Duration, TimestampType, BuilderType> {
+  static Result<std::function<Status(typename TimestampType::c_type arg)>> Get(
+      const std::vector<BuilderType*>& field_builders, const ArrayData& in,
+      StructBuilder* struct_builder) {
+    const auto& timezone = GetInputTimezone(in);
+    if (timezone.empty()) {
+      return [=](TimestampType::c_type arg) {
+        const auto ymd = year_month_day(floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(arg)));;
+        field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+        field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+        field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+        return struct_builder->Append();
+      };
+    }
+    ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone));
+    return [=](TimestampType::c_type arg) {
+        const auto ymd = year_month_day(floor<days>(ZonedLocalizer{tz}.template ConvertTimePoint<Duration>(arg)));      
+        field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+        field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+        field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+      return struct_builder->Append();
+    };
+  }
+};
+
+template <typename Duration, typename InType>
+struct DateStruct {
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    if (in.is_valid) {
+      const auto& in_val = internal::UnboxScalar<const InType>::Unbox(in);
+      const auto t = floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(in_val));

Review comment:
       It might be possible to structure things more like the other kernels except perhaps return `year_month_day`. And then we can dispatch to the right one variant here and reuse the same implementation across the scalar/array and zoned/naive timestamp cases.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -1390,29 +1392,31 @@ For timestamps inputs with non-empty timezone, localized timestamp components wi
 +--------------------+------------+-------------------+---------------+----------------------------+-------+
 | subsecond          | Unary      | Timestamp, Time   | Double        |                            |       |
 +--------------------+------------+-------------------+---------------+----------------------------+-------+
-| us_week            | Unary      | Temporal          | Int64         |                            | \(4)  |
+| us_week            | Unary      | Temporal          | Int64         |                            | \(5)  |
 +--------------------+------------+-------------------+---------------+----------------------------+-------+
-| week               | Unary      | Timestamp         | Int64         | :struct:`WeekOptions`      | \(5)  |
+| week               | Unary      | Timestamp         | Int64         | :struct:`WeekOptions`      | \(6)  |
 +--------------------+------------+-------------------+---------------+----------------------------+-------+
 | year               | Unary      | Temporal          | Int64         |                            |       |
 +--------------------+------------+-------------------+---------------+----------------------------+-------+
 
-* \(1) Outputs the number of the day of the week. By default week begins on Monday
+* \(1) Output is a ``{"year": output type, "month": output type, "day": output type}`` Struct.

Review comment:
       This should be something like `{"year": int64(), ...}` no?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -833,6 +931,13 @@ const FunctionDoc day_doc{
      "cannot be found in the timezone database."),
     {"values"}};
 
+const FunctionDoc date_struct_doc{
+  "Extract (year, month, day) struct",
+  ("Null values emit null.\n"
+   "An error is returned in the values have a defined timezone but it\n"
+   "cannot be found in the timezone database."),

Review comment:
       ```suggestion
      "An error is returned if the values have a defined timezone but it\n"
      "cannot be found in the timezone database."),
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -185,6 +191,98 @@ struct Day {
   Localizer localizer_;
 };
 
+// ----------------------------------------------------------------------
+// Extract (year, month, day) struct from temporal types
+
+template <typename Duration, typename InType, typename BuilderType>
+struct DateStructVisitValueFunction {
+  static Result<std::function<Status(typename InType::c_type arg)>> Get(
+      const std::vector<BuilderType*>& field_builders, const ArrayData&,
+      StructBuilder* struct_builder) {
+    return [=](typename InType::c_type arg) {
+      const auto ymd = year_month_day(floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(arg)));;
+      field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+      field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+      field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+      return struct_builder->Append();
+    };
+  };
+};  
+
+template <typename Duration, typename BuilderType>
+struct DateStructVisitValueFunction<Duration, TimestampType, BuilderType> {
+  static Result<std::function<Status(typename TimestampType::c_type arg)>> Get(
+      const std::vector<BuilderType*>& field_builders, const ArrayData& in,
+      StructBuilder* struct_builder) {
+    const auto& timezone = GetInputTimezone(in);
+    if (timezone.empty()) {
+      return [=](TimestampType::c_type arg) {
+        const auto ymd = year_month_day(floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(arg)));;
+        field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+        field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+        field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+        return struct_builder->Append();
+      };
+    }
+    ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone));
+    return [=](TimestampType::c_type arg) {
+        const auto ymd = year_month_day(floor<days>(ZonedLocalizer{tz}.template ConvertTimePoint<Duration>(arg)));      
+        field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+        field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+        field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+      return struct_builder->Append();
+    };
+  }
+};
+
+template <typename Duration, typename InType>
+struct DateStruct {
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    if (in.is_valid) {
+      const auto& in_val = internal::UnboxScalar<const InType>::Unbox(in);
+      const auto t = floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(in_val));

Review comment:
       What if the scalar type has a timezone?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -185,6 +191,98 @@ struct Day {
   Localizer localizer_;
 };
 
+// ----------------------------------------------------------------------
+// Extract (year, month, day) struct from temporal types
+
+template <typename Duration, typename InType, typename BuilderType>
+struct DateStructVisitValueFunction {
+  static Result<std::function<Status(typename InType::c_type arg)>> Get(
+      const std::vector<BuilderType*>& field_builders, const ArrayData&,
+      StructBuilder* struct_builder) {
+    return [=](typename InType::c_type arg) {
+      const auto ymd = year_month_day(floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(arg)));;
+      field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+      field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+      field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+      return struct_builder->Append();
+    };
+  };
+};  
+
+template <typename Duration, typename BuilderType>
+struct DateStructVisitValueFunction<Duration, TimestampType, BuilderType> {
+  static Result<std::function<Status(typename TimestampType::c_type arg)>> Get(
+      const std::vector<BuilderType*>& field_builders, const ArrayData& in,
+      StructBuilder* struct_builder) {
+    const auto& timezone = GetInputTimezone(in);
+    if (timezone.empty()) {
+      return [=](TimestampType::c_type arg) {
+        const auto ymd = year_month_day(floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(arg)));;
+        field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+        field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+        field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+        return struct_builder->Append();
+      };
+    }
+    ARROW_ASSIGN_OR_RAISE(auto tz, LocateZone(timezone));
+    return [=](TimestampType::c_type arg) {
+        const auto ymd = year_month_day(floor<days>(ZonedLocalizer{tz}.template ConvertTimePoint<Duration>(arg)));      
+        field_builders[0]->UnsafeAppend(static_cast<const int32_t>(ymd.year()));
+        field_builders[1]->UnsafeAppend(static_cast<const uint32_t>(ymd.month()));
+        field_builders[2]->UnsafeAppend(static_cast<const uint32_t>(ymd.day()));
+      return struct_builder->Append();
+    };
+  }
+};
+
+template <typename Duration, typename InType>
+struct DateStruct {
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    if (in.is_valid) {
+      const auto& in_val = internal::UnboxScalar<const InType>::Unbox(in);
+      const auto t = floor<days>(NonZonedLocalizer{}.template ConvertTimePoint<Duration>(in_val));
+      const auto ymd = year_month_day(t);
+      
+      ScalarVector values = {std::make_shared<Int64Scalar>(static_cast<const int32_t>(ymd.year())),
+	std::make_shared<Int64Scalar>(static_cast<const uint32_t>(ymd.month())),
+	std::make_shared<Int64Scalar>(static_cast<const uint32_t>(ymd.day()))};
+      *checked_cast<StructScalar*>(out) =
+          StructScalar(std::move(values), DateStructType());

Review comment:
       You're going to want to format the code. See https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org