You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/13 10:25:30 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #12657: ARROW-14821: [C++][R] Implement bindings for lubridate's floor_date, ceiling_date, and round_date

pitrou commented on code in PR #12657:
URL: https://github.com/apache/arrow/pull/12657#discussion_r849301387


##########
python/pyarrow/tests/test_compute.py:
##########
@@ -2104,6 +2116,7 @@ def test_round_temporal(unit):
         "1967-02-26 05:56:46.922376960",
         "1975-11-01 10:55:37.016146432",
         "1982-01-21 18:43:44.517366784",
+        "1992-01-01T00:00:00.100000000",

Review Comment:
   I'm curious, is the use of a "T" separator deliberate? Otherwise it might puzzle the reader.



##########
cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc:
##########
@@ -705,21 +721,75 @@ year_month_day GetFlooredYmd(int64_t arg, int multiple, Localizer localizer_) {
     } else {
       total_months = (total_months - multiple + 1) / multiple * multiple;
     }
-    return year_month_day(year{1970} / jan / 0) + months{total_months};
+    return year{1970} / jan / 1 + months{total_months};
   }
 }
 
 template <typename Duration, typename Unit, typename Localizer>
-const Duration FloorTimePoint(const int64_t arg, const int64_t multiple,
+const Duration FloorTimePoint(const int64_t arg, const RoundTemporalOptions options,

Review Comment:
   Same here (pass by reference)?



##########
cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc:
##########
@@ -705,21 +721,75 @@ year_month_day GetFlooredYmd(int64_t arg, int multiple, Localizer localizer_) {
     } else {
       total_months = (total_months - multiple + 1) / multiple * multiple;
     }
-    return year_month_day(year{1970} / jan / 0) + months{total_months};
+    return year{1970} / jan / 1 + months{total_months};
   }
 }
 
 template <typename Duration, typename Unit, typename Localizer>
-const Duration FloorTimePoint(const int64_t arg, const int64_t multiple,
+const Duration FloorTimePoint(const int64_t arg, const RoundTemporalOptions options,
                               Localizer localizer_, Status* st) {
   const auto t = localizer_.template ConvertTimePoint<Duration>(arg);
-  const Unit d = floor<Unit>(t).time_since_epoch();
 
-  if (multiple == 1) {
+  if (options.multiple == 1) {
+    const Unit d = floor<Unit>(t).time_since_epoch();
     return localizer_.template ConvertLocalToSys<Duration>(duration_cast<Duration>(d),
                                                            st);
+  } else if (options.calendar_based_origin) {
+    const Unit unit = Unit{options.multiple};
+
+    switch (options.unit) {
+      case compute::CalendarUnit::DAY: {
+        const auto origin =
+            localizer_.ConvertDays(year_month_day(floor<days>(t)).year() /
+                                   year_month_day(floor<days>(t)).month() / 1);
+        const auto m = (floor<days>(t) - origin) / unit * unit + origin;

Review Comment:
   Is `floor<days>` actually required here?



##########
cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc:
##########
@@ -689,11 +689,27 @@ struct IsDaylightSavings {
 // Round temporal values to given frequency
 
 template <typename Duration, typename Localizer>
-year_month_day GetFlooredYmd(int64_t arg, int multiple, Localizer localizer_) {
+year_month_day GetFlooredYmd(int64_t arg, const int multiple,
+                             const RoundTemporalOptions options, Localizer localizer_) {

Review Comment:
   Probably better to pass the options by reference here?
   ```suggestion
   year_month_day GetFlooredYmd(int64_t arg, const int multiple,
                                const RoundTemporalOptions& options, Localizer localizer_) {
   ```



##########
cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc:
##########
@@ -689,11 +689,27 @@ struct IsDaylightSavings {
 // Round temporal values to given frequency
 
 template <typename Duration, typename Localizer>
-year_month_day GetFlooredYmd(int64_t arg, int multiple, Localizer localizer_) {
+year_month_day GetFlooredYmd(int64_t arg, const int multiple,
+                             const RoundTemporalOptions options, Localizer localizer_) {
   year_month_day ymd{floor<days>(localizer_.template ConvertTimePoint<Duration>(arg))};
 
   if (multiple == 1) {
     return year_month_day(ymd.year() / ymd.month() / 1);
+  } else if (options.calendar_based_origin) {
+    switch (options.unit) {
+      case compute::CalendarUnit::MONTH: {
+        const auto m =
+            static_cast<uint32_t>(ymd.month()) / options.multiple * options.multiple;
+        return year_month_day(ymd.year() / 1 / 1) + months{m};
+      }
+      case compute::CalendarUnit::QUARTER: {
+        const auto m = static_cast<uint32_t>(ymd.month()) / (options.multiple * 3) *
+                       (options.multiple * 3);
+        return year_month_day(ymd.year() / 1 / 1) + months{m};
+      }
+      default:
+        return ymd;

Review Comment:
   What about `YEAR` and `DAY`?



##########
cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc:
##########
@@ -705,21 +721,75 @@ year_month_day GetFlooredYmd(int64_t arg, int multiple, Localizer localizer_) {
     } else {
       total_months = (total_months - multiple + 1) / multiple * multiple;
     }
-    return year_month_day(year{1970} / jan / 0) + months{total_months};
+    return year{1970} / jan / 1 + months{total_months};

Review Comment:
   Was it a bug? If so, is a test added for it?



##########
python/pyarrow/_compute.pyx:
##########
@@ -901,10 +903,20 @@ class RoundTemporalOptions(_RoundTemporalOptions):
         "nanosecond".
     week_starts_monday : bool, default True
         If True, weeks start on Monday; if False, on Sunday.
+    change_on_boundary : bool, default False
+        If True times exactly on unit multiple boundary will be rounded
+        one unit multiple up. This applies for ceiling only.
+    calendar_based_origin : bool, default False
+        By default origin is 1970-01-01T00:00:00. By setting this to True,
+        rounding origin will be beginning of one less precise calendar unit.
+        E.g.: rounding to hours will use beginning of day as origin.
+
     """
 
-    def __init__(self, multiple=1, unit="day", week_starts_monday=True):
-        self._set_options(multiple, unit, week_starts_monday)
+    def __init__(self, multiple=1, unit="day", week_starts_monday=True,
+                 change_on_boundary=False, calendar_based_origin=False):

Review Comment:
   Hmm, can we make all boolean options keyword-only?
   ```suggestion
       def __init__(self, multiple=1, unit="day", *, week_starts_monday=True,
                    change_on_boundary=False, calendar_based_origin=False):
   ```



##########
cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc:
##########
@@ -728,18 +798,31 @@ const Duration FloorTimePoint(const int64_t arg, const int64_t multiple,
 }
 
 template <typename Duration, typename Localizer>
-const Duration FloorWeekTimePoint(const int64_t arg, const int64_t multiple,
+const Duration FloorWeekTimePoint(const int64_t arg, const RoundTemporalOptions options,
                                   Localizer localizer_, const Duration weekday_offset,
                                   Status* st) {
   const auto t = localizer_.template ConvertTimePoint<Duration>(arg) + weekday_offset;
   const weeks d = floor<weeks>(t).time_since_epoch();
 
-  if (multiple == 1) {
+  if (options.multiple == 1) {
     return localizer_.template ConvertLocalToSys<Duration>(duration_cast<Duration>(d),
                                                            st) -
            weekday_offset;
+  } else if (options.calendar_based_origin) {
+    weekday wd_;

Review Comment:
   Can you add comments explaining the following calculation?



##########
cpp/src/arrow/compute/api_scalar.h:
##########
@@ -117,6 +119,13 @@ class ARROW_EXPORT RoundTemporalOptions : public FunctionOptions {
   CalendarUnit unit;
   /// What day does the week start with (Monday=true, Sunday=false)
   bool week_starts_monday;
+  /// Times exactly on unit multiple boundary will be rounded one unit multiple up.
+  /// This applies for ceiling only.
+  bool change_on_boundary;

Review Comment:
   Since this is only for `ceil`, perhaps rename it `strict_ceil` or something similar? @jorisvandenbossche what do you think?



##########
python/pyarrow/tests/test_compute.py:
##########
@@ -2078,6 +2078,19 @@ def _check_temporal_rounding(ts, values, unit):
         expected = ts.dt.round(frequency)
         np.testing.assert_array_equal(result, expected)
 
+    # TODO: should work for day

Review Comment:
   Why doesn't it? Is it a bug on our side or is it Pandas' fault?



##########
cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc:
##########
@@ -705,21 +721,75 @@ year_month_day GetFlooredYmd(int64_t arg, int multiple, Localizer localizer_) {
     } else {
       total_months = (total_months - multiple + 1) / multiple * multiple;
     }
-    return year_month_day(year{1970} / jan / 0) + months{total_months};
+    return year{1970} / jan / 1 + months{total_months};
   }
 }
 
 template <typename Duration, typename Unit, typename Localizer>
-const Duration FloorTimePoint(const int64_t arg, const int64_t multiple,
+const Duration FloorTimePoint(const int64_t arg, const RoundTemporalOptions options,
                               Localizer localizer_, Status* st) {
   const auto t = localizer_.template ConvertTimePoint<Duration>(arg);
-  const Unit d = floor<Unit>(t).time_since_epoch();
 
-  if (multiple == 1) {
+  if (options.multiple == 1) {
+    const Unit d = floor<Unit>(t).time_since_epoch();
     return localizer_.template ConvertLocalToSys<Duration>(duration_cast<Duration>(d),
                                                            st);
+  } else if (options.calendar_based_origin) {
+    const Unit unit = Unit{options.multiple};
+
+    switch (options.unit) {
+      case compute::CalendarUnit::DAY: {
+        const auto origin =
+            localizer_.ConvertDays(year_month_day(floor<days>(t)).year() /
+                                   year_month_day(floor<days>(t)).month() / 1);
+        const auto m = (floor<days>(t) - origin) / unit * unit + origin;
+        return localizer_.template ConvertLocalToSys<Duration>(
+            duration_cast<Duration>(m.time_since_epoch()), st);
+      }
+      case compute::CalendarUnit::HOUR: {
+        const auto origin = localizer_.ConvertDays(year_month_day(floor<days>(t)));
+        const auto m = (t - origin) / unit * unit + origin;
+        return localizer_.template ConvertLocalToSys<Duration>(
+            duration_cast<Duration>(m.time_since_epoch()), st);

Review Comment:
   This snippet seems repeated in every `case` clause, perhaps move it out of the `switch` entirely?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org