You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/06 11:43:45 UTC

[GitHub] [arrow] rok commented on a diff in pull request #12154: ARROW-14821: [R] Implement bindings for lubridate's floor_date, ceiling_date, and round_date

rok commented on code in PR #12154:
URL: https://github.com/apache/arrow/pull/12154#discussion_r843846485


##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -650,6 +658,345 @@ test_that("extract yday from date", {
   )
 })
 
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime, change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "quarter", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "year", change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+
+test_that("change_on_boundary is respected in ceiling_time", {
+
+  boundary_times <- tibble::tibble(
+    datetime = as.POSIXct(strptime(c(
+      "2022-05-10 00:00:00", # boundary for week (Sunday / week_start = 7)
+      "2022-03-10 00:00:00", # boundary for: day, hour, minute, second, millisecond
+      "2022-03-10 00:00:01", # boundary for: second, millisecond
+      "2022-03-10 00:01:00", # boundary for: second, millisecond, minute
+      "2022-03-10 01:00:00"  # boundary for: second, millisecond, minute, hour
+    ), tz="UTC", format = "%F %T"))
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day"),
+        out_2 = ceiling_date(datetime, "hour"),
+        out_3 = ceiling_date(datetime, "minute"),
+        out_4 = ceiling_date(datetime, "second"),
+        out_5 = ceiling_date(datetime, ".001 second")
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "hour", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "minute", change_on_boundary = FALSE),
+        out_4 = ceiling_date(datetime, "second", change_on_boundary = FALSE),
+        out_5 = ceiling_date(datetime, ".001 second", change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day", change_on_boundary = TRUE),
+        out_2 = ceiling_date(datetime, "hour", change_on_boundary = TRUE),
+        out_3 = ceiling_date(datetime, "minute", change_on_boundary = TRUE),
+        out_4 = ceiling_date(datetime, "second", change_on_boundary = TRUE),
+        out_5 = ceiling_date(datetime, ".001 second", change_on_boundary = TRUE)
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  boundary_dates <- tibble::tibble(
+    date = as.Date(c(
+      "2001-05-10", # regular day
+      "2002-05-10", # regular day
+      "2003-05-10", # regular day
+      "2004-05-10", # regular day
+      "2005-05-10", # regular day
+      "2006-05-10", # regular day
+      "2007-05-10"  # regular day
+    ))
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(date, "day", change_on_boundary = FALSE),
+        out_2 = ceiling_date(date, "day", change_on_boundary = TRUE),
+        out_3 = ceiling_date(date, "day")
+      ) %>%
+      collect(),
+    boundary_dates
+  )

Review Comment:
   Your reasoning with date64 accidentally becoming two date32s makes sense, but I've no idea how it might be happening. I'll take a look if I can reproduce this in C++ tests.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org