You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/01/14 12:52:29 UTC
[GitHub] [arrow] paleolimbot commented on a change in pull request #12154: ARROW-14281: [R] Implement bindings for lubridate's floor_date, ceiling_date, and round_date

paleolimbot commented on a change in pull request #12154:
URL: https://github.com/apache/arrow/pull/12154#discussion_r784807691



##########
File path: r/R/util.R
##########
@@ -209,3 +209,74 @@ handle_csv_read_error <- function(e, schema) {
 
   abort(e)
 }
+
+
+parse_period_unit <- function(x) {
+
+  # the regexp matches against fractional units, but per lubridate
+  # supports integer multiples of a known unit only
+  match_info <- regexpr(
+    pattern = " *(?<multiple>[0-9.,]+)? *(?<unit>[^ \t\n]+)",
+    text = x[[1]],
+    perl = TRUE
+  )
+
+  capture_start <- attr(match_info, "capture.start")
+  capture_length <- attr(match_info, "capture.length")
+  capture_end <- capture_start + capture_length - 1L
+
+  str_unit <- substr(x, capture_start[[2]], capture_end[[2]])
+  str_multiple <- substr(x, capture_start[[1]], capture_end[[1]])
+
+  known_units <- c("nanosecond", "microsecond", "millisecond", "second",
+                   "minute", "hour", "day", "week", "month", "quarter", "year")
+
+  # match the period unit
+  str_unit_start <- substr(str_unit, 1, 3)
+  unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L
+
+  if(any(is.na(unit))) {

Review comment:
       I think `if` statements elsewhere in arrow R code are always followed by a space (`if (...) { ... }`)

##########
File path: r/R/dplyr-funcs-datetime.R
##########
@@ -130,4 +130,19 @@ register_bindings_datetime <- function() {
     inherits(x, "POSIXct") ||
       (inherits(x, "Expression") && x$type_id() %in% Type[c("TIMESTAMP")])
   })
+
+  # TODO:
+  #  support week_start (essential)
+  #  support timezones correctly (essential)
+  #  support change_on_boundary argument to ceiling_date
+  register_binding("round_date", function(x, unit = "second") {
+    Expression$create("round_temporal", x, options = parse_period_unit(unit))
+  }

Review comment:
       ```suggestion
     })
   ```

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -616,3 +616,292 @@ test_that("extract yday from date", {
     test_df
   )
 })
+
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  expect_error(suppressWarnings( # <- hack
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "61 seconds")) %>%
+      collect()
+  ))
+
+  expect_error(suppressWarnings(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "61 minutes")) %>%
+      collect()
+  ))
+
+  expect_error(suppressWarnings(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "25 hours")) %>%
+      collect()
+  ))
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+
+# a simplified test case using UTC timezone only
+test_df_v2 <- tibble::tibble(
+  datetime = c(as.POSIXct("2017-01-01 00:00:11.3456789", tz = "UTC"), NA),
+  date = c(as.Date("2021-09-09"), NA),
+  integer = 1:2
+)

Review comment:
       I think this would be easier to find at the top of the file next to `test_df`

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -616,3 +616,292 @@ test_that("extract yday from date", {
     test_df
   )
 })
+
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  expect_error(suppressWarnings( # <- hack
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "61 seconds")) %>%
+      collect()
+  ))

Review comment:
       I think you can test this more directly with:
   
   ``` r
     expect_error(
       call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
       "Rounding with second > 60 is not supported"
     )
   ```

##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -616,3 +616,292 @@ test_that("extract yday from date", {
     test_df
   )
 })
+
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  expect_error(suppressWarnings( # <- hack
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "61 seconds")) %>%
+      collect()
+  ))
+
+  expect_error(suppressWarnings(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "61 minutes")) %>%
+      collect()
+  ))
+
+  expect_error(suppressWarnings(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "25 hours")) %>%
+      collect()
+  ))
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+
+# a simplified test case using UTC timezone only
+test_df_v2 <- tibble::tibble(
+  datetime = c(as.POSIXct("2017-01-01 00:00:11.3456789", tz = "UTC"), NA),
+  date = c(as.Date("2021-09-09"), NA),
+  integer = 1:2
+)
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month"),
+        out_2 = ceiling_date(datetime, "quarter"),
+        out_3 = ceiling_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+# NOTE: the hard coding of week_starts = 4 needs to be fixed. round_temporal()
+# treats 1970-01-01 as the beginning of week 1, i.e., week_starts on a Thursday
+
+# NOTE: arrow dplyr binding for ceiling_date() does not force dates up to the
+# next date. the logic mirrors lubridate prior to v1.6.0 (change_on_boundary = FALSE).
+# I'm not 100% sold on this implementation, but it's not obviously terrible
+
+test_that("datetime round/floor/ceil to week", {
+
+  expect_equal(
+    test_df_v2 %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 week")) %>%
+      collect(),
+    test_df_v2 %>%
+      mutate(out = round_date(datetime, "1 week", week_start = 4))
+  )
+
+  expect_equal(
+    test_df_v2 %>%
+      arrow_table() %>%
+      mutate(out = ceiling_date(datetime, "1 week")) %>%
+      collect(),
+    test_df_v2 %>%
+      mutate(out = ceiling_date(datetime, "1 week", week_start = 4, change_on_boundary = FALSE))
+  )
+
+  expect_equal(
+    test_df_v2 %>%
+      arrow_table() %>%
+      mutate(out = floor_date(datetime, "1 week")) %>%
+      collect(),
+    test_df_v2 %>%
+      mutate(out = floor_date(datetime, "1 week", week_start = 4))
+  )
+})
+
+# NOTE: lubridate::round_date() sometimes coerces output from Date to POSIXct.
+# this is not the default for the round_temporal() function in libarrow, which
+# is type stable: timestamps stay timestamps, and date32 stays date32. the
+# current implementation preserves the type stability property. consequently
+# there are edge cases where the arrow dplyr binding will not precisely mirror
+# the lubridate original. with that in mind, all tests for date32 rounding coerce
+# the lubridate equivalent back to Date

Review comment:
       The right way to go, I think!




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org