You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/06/28 18:18:44 UTC

[GitHub] [arrow] jonkeane commented on a diff in pull request #12154: ARROW-14821: [R] Implement bindings for lubridate's floor_date, ceiling_date, and round_date

jonkeane commented on code in PR #12154:
URL: https://github.com/apache/arrow/pull/12154#discussion_r908796810


##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -46,6 +46,46 @@ test_df <- tibble::tibble(
   integer = 1:2
 )
 
+# a simplified test case using UTC timezone only
+test_df_v2 <- tibble::tibble(
+  datetime = c(as.POSIXct("2017-01-01 00:00:11.3456789", tz = "UTC"), NA) + 1,

Review Comment:
   I wonder if (1) we want to test non-UTC times too? Would that change anything? and (2) if we do use a timezone that's not UTC, we should also have times that would be on either side of midnight on different days _in_ UTC for cases where we rollup to days, we want to make sure that does the right thing.



##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -1965,3 +2005,385 @@ test_that("lubridate's fast_strptime", {
       collect()
   )
 })
+
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime, change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "quarter", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "year", change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+
+check_boundary_with_unit <- function(unit, ...) {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL),
+        cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE),
+        cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times,
+    ...
+  )
+}
+
+test_that("ceiling_time works with change_on_boundary: unit = day", {
+  check_boundary_with_unit("day")
+})
+test_that("ceiling_time works with change_on_boundary: unit = hour", {
+  check_boundary_with_unit("hour")
+})
+test_that("ceiling_time works with change_on_boundary: unit = minute", {
+  check_boundary_with_unit("minute", tolerance = .001) # floating point issue?
+})
+test_that("ceiling_time works with change_on_boundary: unit = second", {
+  check_boundary_with_unit("second")
+})
+test_that("ceiling_time works with change_on_boundary: unit = millisecond", {
+  check_boundary_with_unit(".001 second")
+})
+
+
+# NOTE: until 16142 is resolved, this test has to be written to avoid the
+# 32-bit temporal array misinterpreted as 64-bit bug. The easiest solution
+# is to never use an arrow array of length greater than 1:
+# https://issues.apache.org/jira/browse/ARROW-16142

Review Comment:
   Hmm is this in reference to the test just below it?



##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -686,4 +689,61 @@ register_bindings_datetime_parsers <- function() {
 
     build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
   })
+
+}
+
+
+register_bindings_datetime_rounding <- function() {
+
+  register_binding("round_date", function(x, unit = "second",
+                                          week_start = getOption("lubridate.week.start", 7)) {
+    opts <- parse_period_unit(unit)
+
+    if (opts$unit == 7L) {
+      return(shift_temporal_to_week("round_temporal", x, week_start, options = opts))
+    }

Review Comment:
   Would it be possible to add a comment about which unit is `7L` here so it's easier to follow which one gets this special treatment?



##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -1965,3 +2005,385 @@ test_that("lubridate's fast_strptime", {
       collect()
   )
 })
+
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime, change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "quarter", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "year", change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+
+check_boundary_with_unit <- function(unit, ...) {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL),
+        cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE),
+        cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times,
+    ...
+  )
+}

Review Comment:
   This test helper is fantastic, it might be worth adding a small comment about what it does (it's pretty obvious, but I've thought that before and future-me has proven myself wrong!)



##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -686,4 +689,61 @@ register_bindings_datetime_parsers <- function() {
 
     build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
   })
+
+}
+
+
+register_bindings_datetime_rounding <- function() {
+
+  register_binding("round_date", function(x, unit = "second",
+                                          week_start = getOption("lubridate.week.start", 7)) {
+    opts <- parse_period_unit(unit)
+
+    if (opts$unit == 7L) {
+      return(shift_temporal_to_week("round_temporal", x, week_start, options = opts))
+    }
+
+    Expression$create("round_temporal", x, options = opts)
+  })
+
+  register_binding("floor_date", function(x, unit = "second",
+                                          week_start = getOption("lubridate.week.start", 7)) {
+
+    opts <- parse_period_unit(unit)
+
+    if (opts$unit == 7L) {
+      return(shift_temporal_to_week("floor_temporal", x, week_start, options = opts))
+    }
+
+    return(Expression$create("floor_temporal", x, options = opts))
+  })
+
+  register_binding("ceiling_date", function(x, unit = "second",
+                                            change_on_boundary = NULL,
+                                            week_start = getOption("lubridate.week.start", 7)) {
+
+    opts <- parse_period_unit(unit)
+
+    if (is.null(change_on_boundary)) {
+      if (call_binding("is.Date", x)) {
+        change_on_boundary <- TRUE
+      } else {
+        change_on_boundary <- FALSE
+      }
+    }
+
+    if (change_on_boundary == FALSE) {
+      opts$ceil_is_strictly_greater <- FALSE
+    }
+    if (change_on_boundary == TRUE) {
+      opts$ceil_is_strictly_greater <- TRUE
+    }

Review Comment:
   An if/else might make this look slight more unified and that it is an either or option (though it's pretty obvious from the code that you're overwritting this variable)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org