You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/06 05:54:56 UTC

[GitHub] [arrow] djnavarro commented on a diff in pull request #12154: ARROW-14821: [R] Implement bindings for lubridate's floor_date, ceiling_date, and round_date

djnavarro commented on code in PR #12154:
URL: https://github.com/apache/arrow/pull/12154#discussion_r843493108


##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -650,6 +658,345 @@ test_that("extract yday from date", {
   )
 })
 
+test_that("round/floor/ceiling on datetime (to nearest second)", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime),
+        out_2 = floor_date(datetime),
+        out_3 = ceiling_date(datetime, change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit abbreviation", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "minute"),
+        out_2 = round_date(datetime, "minutes"),
+        out_3 = round_date(datetime, "mins"),
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("period unit extracts integer multiples", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "1 minute"),
+        out_2 = round_date(datetime, "2 minutes"),
+        out_3 = round_date(datetime, "10 minutes")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate errors when 60 sec/60 min/24 hour thresholds exceeded.
+# this test checks that arrow does too.
+test_that("period unit maxima are enforced", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+test_that("datetime rounding between 1sec and 1day", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "second"),
+        out_2 = round_date(datetime, "minute"),
+        out_3 = round_date(datetime, "hour"),
+        out_4 = round_date(datetime, "day")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+# lubridate doesn't accept millisecond, microsecond or nanosecond descriptors:
+# instead it supports corresponding fractions of 1 second. these tests added to
+# that arrow verify that fractional second inputs to arrow mirror lubridate
+
+test_that("datetime rounding below 1sec", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 millisecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 microsecond")) %>%
+      collect()
+  )
+
+  expect_equal(
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, ".000000001 second")) %>%
+      collect(),
+
+    test_df %>%
+      arrow_table() %>%
+      mutate(out = round_date(datetime, "1 nanosecond")) %>%
+      collect()
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, ".01 second"),
+        out_2 = round_date(datetime, ".001 second"),
+        out_3 = round_date(datetime, ".00001 second")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("datetime round/floor/ceil to month/quarter/year", {
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = round_date(datetime, "month"),
+        out_2 = round_date(datetime, "quarter"),
+        out_3 = round_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = floor_date(datetime, "month"),
+        out_2 = floor_date(datetime, "quarter"),
+        out_3 = floor_date(datetime, "year"),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "month", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "quarter", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "year", change_on_boundary = FALSE),
+      ) %>%
+      collect(),
+    test_df_v2
+  )
+})
+
+
+test_that("change_on_boundary is respected in ceiling_time", {
+
+  boundary_times <- tibble::tibble(
+    datetime = as.POSIXct(strptime(c(
+      "2022-05-10 00:00:00", # boundary for week (Sunday / week_start = 7)
+      "2022-03-10 00:00:00", # boundary for: day, hour, minute, second, millisecond
+      "2022-03-10 00:00:01", # boundary for: second, millisecond
+      "2022-03-10 00:01:00", # boundary for: second, millisecond, minute
+      "2022-03-10 01:00:00"  # boundary for: second, millisecond, minute, hour
+    ), tz="UTC", format = "%F %T"))
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day"),
+        out_2 = ceiling_date(datetime, "hour"),
+        out_3 = ceiling_date(datetime, "minute"),
+        out_4 = ceiling_date(datetime, "second"),
+        out_5 = ceiling_date(datetime, ".001 second")
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day", change_on_boundary = FALSE),
+        out_2 = ceiling_date(datetime, "hour", change_on_boundary = FALSE),
+        out_3 = ceiling_date(datetime, "minute", change_on_boundary = FALSE),
+        out_4 = ceiling_date(datetime, "second", change_on_boundary = FALSE),
+        out_5 = ceiling_date(datetime, ".001 second", change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(datetime, "day", change_on_boundary = TRUE),
+        out_2 = ceiling_date(datetime, "hour", change_on_boundary = TRUE),
+        out_3 = ceiling_date(datetime, "minute", change_on_boundary = TRUE),
+        out_4 = ceiling_date(datetime, "second", change_on_boundary = TRUE),
+        out_5 = ceiling_date(datetime, ".001 second", change_on_boundary = TRUE)
+      ) %>%
+      collect(),
+    boundary_times
+  )
+
+  boundary_dates <- tibble::tibble(
+    date = as.Date(c(
+      "2001-05-10", # regular day
+      "2002-05-10", # regular day
+      "2003-05-10", # regular day
+      "2004-05-10", # regular day
+      "2005-05-10", # regular day
+      "2006-05-10", # regular day
+      "2007-05-10"  # regular day
+    ))
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        out_1 = ceiling_date(date, "day", change_on_boundary = FALSE),
+        out_2 = ceiling_date(date, "day", change_on_boundary = TRUE),
+        out_3 = ceiling_date(date, "day")
+      ) %>%
+      collect(),
+    boundary_dates
+  )

Review Comment:
   Hi @rok. Making progress with this, but pausing to ping you because this test currently throws a very strange error. The input here is a vector of dates with years 2001, 2002, 2003, etc. The output should contain the same vectors, because the rounding unit here is just to the nearest day, i.e.,
   
   ```
   2001-05-10,
   2002-05-10,
   2003-05-10,
   2004-05-10,
   etc
   ```
   
   what we actually get is the correct answer but with the unix epoch interleaved, i.e.,
   
   ```
   2001-05-10,
   1970-01-01,
   2002-05-10,
   1970-01-01,
   2003-05-10,
   etc
   ```
   
   My guess is that this must be something to do with how the data buffer is being parsed? I'm guessing a single 64-bit date in the C++ library has been treated as if it is two 32-bit dates when it's being pulled back into R. Any thoughts on what might be going on here?  
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org