You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by am...@apache.org on 2022/05/19 09:57:40 UTC
[arrow] branch master updated: ARROW-16439: [R] Implement binding for `lubridate::fast_strptime`
This is an automated email from the ASF dual-hosted git repository.
amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7f31c9d2d9 ARROW-16439: [R] Implement binding for `lubridate::fast_strptime`
7f31c9d2d9 is described below
commit 7f31c9d2d95b5b30fa7997c2519d3b85919f1fd9
Author: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
AuthorDate: Thu May 19 11:57:32 2022 +0200
ARROW-16439: [R] Implement binding for `lubridate::fast_strptime`
This PR adds:
* a binding emulating lubridate's `fast_strptime` functionality
This PR does not support the following arguments:
* `lt = TRUE` - this returns a POSIXlt object (a list) which cannot be easily used in a dplyr pipeline (currently it actually errors) => we have a different default `lt = FALSE` for the Arrow binding, and
* `cutoff_2000 = 68L` - for the `y%` format two-digit numbers smaller or equal to `cutoff_2000` are parsed as though starting with 20, otherwise parsed as though starting with 19. It would be nice to have this, so I raised [ARROW-16596](https://issues.apache.org/jira/browse/ARROW-16596). We can always suggest users they manipulate the strings before parsing, so I don't think this is crucial functionality.
The following code will be possible once the PR is merged:
``` r
library(lubridate, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(arrow, warn.conflicts = FALSE)
dates_table <- tibble(
string_with_short_year = c("68-05-17", "69-05-17", "55-05-17")
)
dates_table %>%
mutate(
date = fast_strptime(
string_with_short_year,
format = "%y-%m-%d",
lt = FALSE
)
)
#> # A tibble: 3 × 2
#> string_with_short_year date
#> <chr> <dttm>
#> 1 68-05-17 2068-05-17 00:00:00
#> 2 69-05-17 1969-05-17 00:00:00
#> 3 55-05-17 2055-05-17 00:00:00
dates_table %>%
arrow_table() %>%
mutate(
date = fast_strptime(
string_with_short_year,
format = "%y-%m-%d",
lt = FALSE
)
) %>%
collect()
#> # A tibble: 3 × 2
#> string_with_short_year date
#> <chr> <dttm>
#> 1 68-05-17 2068-05-17 00:00:00
#> 2 69-05-17 1969-05-17 00:00:00
#> 3 55-05-17 2055-05-17 00:00:00
```
<sup>Created on 2022-05-18 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>
Closes #13174 from dragosmg/fast_strptime_binding
Authored-by: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
Signed-off-by: Alessandro Molina <am...@turbogears.org>
---
r/R/dplyr-funcs-datetime.R | 36 ++++++++
r/tests/testthat/test-dplyr-funcs-datetime.R | 130 +++++++++++++++++++++++++++
2 files changed, 166 insertions(+)
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index 02ec35bda2..64847f41fd 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -618,4 +618,40 @@ register_bindings_datetime_parsers <- function() {
for (ymd_order in ymd_parser_vec) {
register_binding(ymd_order, ymd_parser_map_factory(ymd_order))
}
+
+ register_binding("fast_strptime", function(x,
+ format,
+ tz = "UTC",
+ lt = FALSE,
+ cutoff_2000 = 68L) {
+ # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play
+ # well with mutate, for example)
+ if (lt) {
+ arrow_not_supported("`lt = TRUE` argument")
+ }
+
+ # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16596 is done
+ if (cutoff_2000 != 68L) {
+ arrow_not_supported("`cutoff_2000` != 68L argument")
+ }
+
+ parse_attempt_expressions <- list()
+
+ parse_attempt_expressions <- map(
+ format,
+ ~ build_expr(
+ "strptime",
+ x,
+ options = list(
+ format = .x,
+ unit = 0L,
+ error_is_null = TRUE
+ )
+ )
+ )
+
+ coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)
+
+ build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
+ })
}
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R
index b122363015..86b1862ab4 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -1812,3 +1812,133 @@ test_that("ym, my & yq parsers", {
test_df
)
})
+
+test_that("lubridate's fast_strptime", {
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y =
+ fast_strptime(
+ x,
+ format = "%Y-%m-%d %H:%M:%S",
+ lt = FALSE
+ )
+ ) %>%
+ collect(),
+ tibble(
+ x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA)
+ )#,
+ # arrow does not preserve the `tzone` attribute
+ # test ignore_attr = TRUE
+ )
+
+ # R object
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ y =
+ fast_strptime(
+ "68-10-07 19:04:05",
+ format = "%y-%m-%d %H:%M:%S",
+ lt = FALSE
+ )
+ ) %>%
+ collect(),
+ tibble(
+ x = c("2018-10-07 19:04:05", NA)
+ )#,
+ # test ignore_attr = TRUE
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ date_multi_formats =
+ fast_strptime(
+ x,
+ format = c("%Y-%m-%d %H:%M:%S", "%m-%d-%Y %H:%M:%S"),
+ lt = FALSE
+ )
+ ) %>%
+ collect(),
+ tibble(
+ x = c("2018-10-07 19:04:05", "10-07-1968 19:04:05")
+ )
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ dttm_with_tz = fast_strptime(
+ dttm_as_string,
+ format = "%Y-%m-%d %H:%M:%S",
+ tz = "Pacific/Marquesas",
+ lt = FALSE
+ )
+ ) %>%
+ collect(),
+ tibble(
+ dttm_as_string =
+ c("2018-10-07 19:04:05", "1969-10-07 19:04:05", NA)
+ )
+ )
+
+ # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is
+ # implicitly set to 68L both in lubridate and in Arrow
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ date_short_year =
+ fast_strptime(
+ x,
+ format = "%y-%m-%d %H:%M:%S",
+ lt = FALSE
+ )
+ ) %>%
+ collect(),
+ tibble(
+ x =
+ c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
+ )#,
+ # arrow does not preserve the `tzone` attribute
+ # test ignore_attr = TRUE
+ )
+
+ # the arrow binding errors for a value different from 68L for `cutoff_2000`
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ date_short_year =
+ fast_strptime(
+ x,
+ format = "%y-%m-%d %H:%M:%S",
+ lt = FALSE,
+ cutoff_2000 = 69L
+ )
+ ) %>%
+ collect(),
+ tibble(
+ x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
+ ),
+ warning = TRUE
+ )
+
+ # compare_dplyr_binding would not work here since lt = TRUE returns a list
+ # and it also errors in regular dplyr pipelines
+ expect_warning(
+ tibble(
+ x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
+ ) %>%
+ arrow_table() %>%
+ mutate(
+ date_short_year =
+ fast_strptime(
+ x,
+ format = "%y-%m-%d %H:%M:%S",
+ lt = TRUE
+ )
+ ) %>%
+ collect()
+ )
+})