You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by th...@apache.org on 2022/07/05 11:35:33 UTC
[arrow] branch master updated: ARROW-15804: [R] Improve as.Date() error message when supplying several tryFormats
This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3f205d4977 ARROW-15804: [R] Improve as.Date() error message when supplying several tryFormats
3f205d4977 is described below
commit 3f205d4977e5f4955683fc1a824cb7889d048eba
Author: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
AuthorDate: Tue Jul 5 12:35:19 2022 +0100
ARROW-15804: [R] Improve as.Date() error message when supplying several tryFormats
**Update**: this PR will only improve the message users get when trying to pass multiple formats to `tryFormats`. The new message will include an advice to use the dedicated parsing functions. This conclusion / recommendation can be found here (first comment): https://issues.apache.org/jira/browse/ARROW-15805
=================
**Old description**: This PR will allow users to try parsing with several formats via the `tryFormats` argument to `as.Date()` and `as_date()`
> _tryFormats_ - character vector of format strings to try if format is not specified.
`as_date()` does not have the `tryFormats` argument, but will assume an ISO 8601 format if nothing is passed via the `format` argument.
https://github.com/tidyverse/lubridate/blob/c4714046ec412e669906ab8d1cd9a8e4fb125e13/R/coercion.r#L725
I implemented `tryFormats` with the `parse_date_time()` binding to coalesce through the format strings. One implication is that `as.Date()` is no longer erroring when not being able to parse, but rather return an `NA`.
**A thought**: should we only support `tryFormats` only when `RE2` is available? As it currently is, this PR will remove the `as.Date` functionality if `format` is `NULL` for Windows R 3.6 users.
Closes #13070 from dragosmg/asDate_with_tryFormats
Authored-by: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
Signed-off-by: Nic Crane <th...@gmail.com>
---
r/R/dplyr-datetime-helpers.R | 3 -
r/R/dplyr-funcs-datetime.R | 10 +++
r/tests/testthat/test-dplyr-funcs-datetime.R | 117 +++++++++++++++++++--------
3 files changed, 93 insertions(+), 37 deletions(-)
diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index 60771c8624..7099d79c78 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -104,9 +104,6 @@ binding_as_date <- function(x,
format = NULL,
tryFormats = "%Y-%m-%d",
origin = "1970-01-01") {
- if (is.null(format) && length(tryFormats) > 1) {
- abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
- }
if (call_binding("is.Date", x)) {
return(x)
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index 8ecb80b6b4..f7d948fdf2 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -297,6 +297,16 @@ register_bindings_datetime_conversion <- function() {
tryFormats = "%Y-%m-%d",
origin = "1970-01-01",
tz = "UTC") {
+
+ if (is.null(format) && length(tryFormats) > 1) {
+ abort(
+ paste(
+ "`as.Date()` with multiple `tryFormats` is not supported in Arrow,",
+ "consider using the lubridate specialised parsing functions such as, `ymd()`, `ymd()`, etc."
+ )
+ )
+ }
+
# base::as.Date() and lubridate::as_date() differ in the way they use the
# `tz` argument. Both cast to the desired timezone, if present. The
# difference appears when the `tz` argument is not set: `as.Date()` uses the
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R
index dc9e5609ea..183170ff83 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -1508,16 +1508,19 @@ test_that("make_difftime()", {
test_that("`as.Date()` and `as_date()`", {
test_df <- tibble::tibble(
- posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"),
- dt_europe = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"),
- dt_utc = ymd_hms("2010-08-03 00:50:50"),
- date_var = as.Date("2022-02-25"),
- difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"),
- character_ymd_var = "2022-02-25 00:00:01",
- character_ydm_var = "2022/25/02 00:00:01",
- integer_var = 32L,
- integerish_var = 32,
- double_var = 34.56
+ posixct_var = as.POSIXct(c("2022-02-25 00:00:01", "1987-11-24 12:34:56", NA), tz = "Pacific/Marquesas"),
+ dt_europe = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA, tz = "Europe/London"),
+ dt_utc = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA),
+ date_var = as.Date(c("2022-02-25", "1987-11-24", NA)),
+ difference_date = ymd_hms("2010-08-03 00:50:50", "1987-11-24 12:34:56", NA, tz = "Pacific/Marquesas"),
+ try_formats_string = c(NA, "2022-01-01", "2022/01/01"),
+ character_ymd_hms_var = c("2022-02-25 00:00:01", "1987-11-24 12:34:56", NA),
+ character_ydm_hms_var = c("2022/25/02 00:00:01", "1987/24/11 12:34:56", NA),
+ character_ymd_var = c("2022-02-25", "1987-11-24", NA),
+ character_ydm_var = c("2022/25/02", "1987/24/11", NA),
+ integer_var = c(21L, 32L, NA),
+ integerish_var = c(21, 32, NA),
+ double_var = c(12.34, 56.78, NA)
)
compare_dplyr_binding(
@@ -1528,8 +1531,8 @@ test_that("`as.Date()` and `as_date()`", {
date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"),
date_utc1 = as.Date(dt_utc),
date_europe1 = as.Date(dt_europe),
- date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"),
- date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"),
+ date_char_ymd_hms1 = as.Date(character_ymd_hms_var, format = "%Y-%m-%d %H:%M:%S"),
+ date_char_ydm_hms1 = as.Date(character_ydm_hms_var, format = "%Y/%d/%m %H:%M:%S"),
date_int1 = as.Date(integer_var, origin = "1970-01-01"),
date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"),
date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"),
@@ -1538,8 +1541,8 @@ test_that("`as.Date()` and `as_date()`", {
date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"),
date_utc2 = as_date(dt_utc),
date_europe2 = as_date(dt_europe),
- date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"),
- date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"),
+ date_char_ymd2 = as_date(character_ymd_hms_var, format = "%Y-%m-%d %H:%M:%S"),
+ date_char_ydm2 = as_date(character_ydm_hms_var, format = "%Y/%d/%m %H:%M:%S"),
date_int2 = as_date(integer_var, origin = "1970-01-01"),
date_int_origin2 = as_date(integer_var, origin = "1970-01-03"),
date_integerish2 = as_date(integerish_var, origin = "1970-01-01")
@@ -1549,35 +1552,63 @@ test_that("`as.Date()` and `as_date()`", {
)
# we do not support multiple tryFormats
- compare_dplyr_binding(
- .input %>%
- mutate(date_char_ymd = as.Date(character_ymd_var,
- tryFormats = c("%Y-%m-%d", "%Y/%m/%d")
- )) %>%
+ # this is not a simple warning, therefore we cannot use compare_dplyr_binding()
+ # with `warning = TRUE`
+ # arrow_table test
+ expect_warning(
+ test_df %>%
+ arrow_table() %>%
+ mutate(
+ date_char_ymd = as.Date(
+ character_ymd_var,
+ tryFormats = c("%Y-%m-%d", "%Y/%m/%d")
+ )
+ ) %>%
collect(),
- test_df,
- warning = TRUE
+ regexp = "consider using the lubridate specialised parsing functions"
)
- # strptime does not support a partial format - testing an error surfaced from
- # C++ (hence not testing the content of the error message)
- # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813
- expect_error(
+ # record batch test
+ expect_warning(
test_df %>%
- arrow_table() %>%
- mutate(date_char_ymd = as_date(character_ymd_var)) %>%
- collect()
+ record_batch() %>%
+ mutate(
+ date_char_ymd = as.Date(
+ character_ymd_var,
+ tryFormats = c("%Y-%m-%d", "%Y/%m/%d")
+ )
+ ) %>%
+ collect(),
+ regexp = "consider using the lubridate specialised parsing functions"
)
+ # strptime does not support a partial format - Arrow returns NA, while
+ # lubridate parses correctly
+ # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813
expect_error(
- test_df %>%
- arrow_table() %>%
- mutate(date_char_ymd = as.Date(character_ymd_var)) %>%
- collect(),
- regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]",
- fixed = TRUE
+ expect_equal(
+ test_df %>%
+ arrow_table() %>%
+ mutate(date_char_ymd_hms = as_date(character_ymd_hms_var)) %>%
+ collect(),
+ test_df %>%
+ mutate(date_char_ymd_hms = as_date(character_ymd_hms_var)) %>%
+ collect()
+ )
)
+ # same as above
+ expect_error(
+ expect_equal(
+ test_df %>%
+ arrow_table() %>%
+ mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var)) %>%
+ collect(),
+ test_df %>%
+ mutate(date_char_ymd_hms = as.Date(character_ymd_hms_var)) %>%
+ collect()
+ )
+ )
# we do not support as.Date() with double/ float (error surfaced from C++)
# TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798
@@ -1613,6 +1644,24 @@ test_that("`as.Date()` and `as_date()`", {
)
})
+test_that("`as_date()` and `as.Date()` work with R objects", {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ date1 = as.Date("2022-05-10"),
+ date2 = as.Date(12, origin = "2022-05-01"),
+ date3 = as.Date("2022-10-03", tryFormats = "%Y-%m-%d"),
+ date4 = as_date("2022-05-10"),
+ date5 = as_date(12, origin = "2022-05-01"),
+ date6 = as_date("2022-10-03")
+ ) %>%
+ collect(),
+ tibble(
+ a = 1
+ )
+ )
+})
+
test_that("`as_datetime()`", {
test_df <- tibble(
date = as.Date(c("2022-03-22", "2021-07-30", NA)),