You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2022/07/29 01:35:26 UTC
[arrow] branch master updated: ARROW-16653: [R] All formats are supported with the lubridate `parse_date_time` binding (#13506)
This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3e87c66ae0 ARROW-16653: [R] All formats are supported with the lubridate `parse_date_time` binding (#13506)
3e87c66ae0 is described below
commit 3e87c66ae00cd25cb9e5e42424b90ace988856ac
Author: Rok Mihevc <ro...@mihevc.org>
AuthorDate: Fri Jul 29 03:35:19 2022 +0200
ARROW-16653: [R] All formats are supported with the lubridate `parse_date_time` binding (#13506)
This is to resolve [ARROW-16653](https://issues.apache.org/jira/browse/ARROW-16653).
Please note the intent here is to map out currently available formats via a new test. That is to support [ARROW-16395](https://issues.apache.org/jira/browse/ARROW-16395) which will add `ymd_hms() ymd_hm() ymd_h() dmy_hms() dmy_hm() dmy_h() mdy_hms() mdy_hm() mdy_h() ydm_hms() ydm_hm() ydm_h()`.
Currently most [lubridate supported flags](https://lubridate.tidyverse.org/reference/parse_date_time.html#details) are available and this adds a test. Remaining are `%q` and `%Op` that we don't need to resolve [ARROW-16395](https://issues.apache.org/jira/browse/ARROW-16395). We could open a ticket for adding support for the two remaining flags to C++ `strptime` or wait for users to request them.
Lead-authored-by: Rok <ro...@mihevc.org>
Co-authored-by: Rok Mihevc <ro...@mihevc.org>
Signed-off-by: Dewey Dunnington <de...@fishandwhistle.net>
---
r/R/dplyr-datetime-helpers.R | 110 ++++++-----
r/tests/testthat/test-dplyr-funcs-datetime.R | 265 ++++++++++++++++++++++++---
2 files changed, 292 insertions(+), 83 deletions(-)
diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index efcc62ff4e..4c9a8d1bf0 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -163,15 +163,31 @@ build_formats <- function(orders) {
# process the `orders` (even if supplied in the desired format)
# Processing is needed (instead of passing
# formats as-is) due to the processing of the character vector in parse_date_time()
+
orders <- gsub("[^A-Za-z]", "", orders)
orders <- gsub("Y", "y", orders)
+ valid_formats <- "[a|A|b|B|d|H|I|j|m|Om|M|Op|p|q|OS|S|U|w|W|y|Y|r|R|T|z]"
+ invalid_orders <- nchar(gsub(valid_formats, "", orders)) > 0
+
+ if (any(invalid_orders)) {
+ arrow_not_supported(
+ paste0(
+ oxford_paste(
+ orders[invalid_orders]
+ ),
+ " `orders`"
+ )
+ )
+ }
+
# we separate "ym', "my", and "yq" from the rest of the `orders` vector and
# transform them. `ym` and `yq` -> `ymd` & `my` -> `myd`
# this is needed for 2 reasons:
# 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format,
# 2. for equivalence to lubridate, which parses `ym` to the first day of the month
- short_orders <- c("ym", "my")
+ short_orders <- c("ym", "my", "yOm", "Omy")
+ quarter_orders <- c("yq", "qy")
if (any(orders %in% short_orders)) {
orders1 <- setdiff(orders, short_orders)
@@ -179,51 +195,10 @@ build_formats <- function(orders) {
orders2 <- paste0(orders2, "d")
orders <- unique(c(orders2, orders1))
}
-
- if (any(orders == "yq")) {
- orders1 <- setdiff(orders, "yq")
- orders2 <- "ymd"
- orders <- unique(c(orders1, orders2))
- }
-
- if (any(orders == "qy")) {
- orders1 <- setdiff(orders, "qy")
- orders2 <- "ymd"
- orders <- unique(c(orders1, orders2))
- }
-
- ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
- ymd_hms_orders <- c(
- "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS",
- "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
- )
- # support "%I" hour formats
- ymd_ims_orders <- gsub("H", "I", ymd_hms_orders)
-
- supported_orders <- c(
- ymd_orders,
- ymd_hms_orders,
- gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as order separators
- gsub("_", "", ymd_hms_orders),
- ymd_ims_orders,
- gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as order separators
- gsub("_", "", ymd_ims_orders)
- )
-
- unsupported_passed_orders <- setdiff(orders, supported_orders)
- supported_passed_orders <- intersect(orders, supported_orders)
-
- # error only if there isn't at least one valid order we can try
- if (length(supported_passed_orders) == 0) {
- arrow_not_supported(
- paste0(
- oxford_paste(
- unsupported_passed_orders
- ),
- " `orders`"
- )
- )
+ if (any(orders %in% quarter_orders)) {
+ orders <- c(setdiff(orders, quarter_orders), "ymd")
}
+ orders <- unique(orders)
formats_list <- map(orders, build_format_from_order)
formats <- purrr::flatten_chr(formats_list)
@@ -239,26 +214,47 @@ build_formats <- function(orders) {
#'
#' @noRd
build_format_from_order <- function(order) {
+ month_formats <- c("%m", "%B", "%b")
+ week_formats <- c("%a", "%A")
+ year_formats <- c("%y", "%Y")
char_list <- list(
- "y" = c("%y", "%Y"),
- "m" = c("%m", "%B", "%b"),
- "d" = "%d",
- "H" = "%H",
- "M" = "%M",
- "S" = "%S",
- "I" = "%I"
+ "%y" = year_formats,
+ "%Y" = year_formats,
+ "%m" = month_formats,
+ "%Om" = month_formats,
+ "%b" = month_formats,
+ "%B" = month_formats,
+ "%a" = week_formats,
+ "%A" = week_formats,
+ "%d" = "%d",
+ "%H" = "%H",
+ "%j" = "%j",
+ "%OS" = "%OS",
+ "%I" = "%I",
+ "%S" = "%S",
+ "%q" = "%q",
+ "%M" = "%M",
+ "%U" = "%U",
+ "%w" = "%w",
+ "%W" = "%W",
+ "%p" = "%p",
+ "%Op" = "%Op",
+ "%z" = "%z",
+ "%r" = c("%H", "%I-%p"),
+ "%R" = c("%H-%M", "%I-%M-%p"),
+ "%T" = c("%I-%M-%S-%p", "%H-%M-%S", "%H-%M-%OS")
)
- split_order <- strsplit(order, split = "")[[1]]
-
+ split_order <- regmatches(order, gregexpr("(O{0,1}[a-zA-Z])", order))[[1]]
+ split_order <- paste0("%", split_order)
outcome <- expand.grid(char_list[split_order])
+
# we combine formats with and without the "-" separator, we will later
# coalesce through all of them (benchmarking indicated this is a more
# computationally efficient approach rather than figuring out if a string has
- # separators or not and applying only )
- # during parsing if the string to be parsed does not contain a separator
+ # separators or not and applying the relevant order afterwards)
formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
- formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+ formats_without_sep <- gsub("-", "", formats_with_sep)
c(formats_with_sep, formats_without_sep)
}
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R
index b8bd28e970..25fe23a28d 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -150,6 +150,71 @@ test_that("strptime", {
as.POSIXct(tstamp),
ignore_attr = "tzone"
)
+
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6)
+ skip_if_not_available("re2")
+
+ tz <- "Pacific/Marquesas"
+ set.seed(42)
+ times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "sec")
+ times <- sample(times, 100)
+
+ # Op format is currently not supported by strptime
+ formats <- c(
+ "%d", "%H", "%j", "%m", "%T",
+ "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%R", "%T"
+ )
+ formats2 <- c(
+ "a", "A", "b", "B", "d", "H", "j", "m", "Om", "T", "OS", "Ip",
+ "S", "q", "M", "U", "w", "W", "y", "Y", "r", "R", "Tz"
+ )
+ base_format <- "%Y-%m-%d"
+ base_format2 <- "ymd"
+
+ # Some formats are not supported on Windows
+ if (!tolower(Sys.info()[["sysname"]]) == "windows") {
+ formats <- c(formats, "%a", "%A", "%b", "%B", "%Om", "%OS", "%I%p", "%r", "%T%z")
+ }
+
+ for (fmt in formats) {
+ fmt <- paste(base_format, fmt)
+ test_df <- tibble::tibble(x = strftime(times, format = fmt))
+ expect_equal(
+ test_df %>%
+ arrow_table() %>%
+ mutate(x = strptime(x, format = fmt)) %>%
+ collect(),
+ test_df %>%
+ mutate(x = as.POSIXct(strptime(x, format = fmt))) %>%
+ collect()
+ )
+ }
+
+ for (fmt in formats2) {
+ fmt2 <- paste(base_format2, fmt)
+ fmt <- paste(base_format, paste0("%", fmt))
+ test_df <- tibble::tibble(x = strftime(times, format = fmt))
+ expect_equal(
+ test_df %>%
+ arrow_table() %>%
+ mutate(x = strptime(x, format = fmt2)) %>%
+ collect(),
+ test_df %>%
+ mutate(x = as.POSIXct(strptime(x, format = fmt2))) %>%
+ collect()
+ )
+ }
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ parsed_date_ymd = parse_date_time(string_1, orders = "Y-%m-d-%T")
+ ) %>%
+ collect(),
+ tibble::tibble(string_1 = c("2022-02-11-12:23:45", NA))
+ )
+
})
test_that("strptime returns NA when format doesn't match the data", {
@@ -2045,6 +2110,118 @@ test_that("ym, my & yq parsers", {
)
})
+test_that("parse_date_time's other formats", {
+ # these functions' internals use some string processing which requires the
+ # RE2 library (not available on Windows with R 3.6)
+ skip_if_not_available("re2")
+
+ # q, OS, Op, z formats are currently not supported by strptime
+ test_df <- tibble(
+ string_a = c("2023-12-30-Sat", NA),
+ string_A = c("2023-12-30-Saturday", NA),
+ string_b = c("2023-12-30-Dec", NA),
+ string_B = c("2023-12-30-December", NA),
+ string_H = c("2023-12-30-01", NA),
+ string_I = c("2023-12-30-01", NA),
+ string_j = c("2023-12-30-364", NA),
+ string_M = c("2023-12-30-00", NA),
+ string_p = c("2023-12-30-AM", NA),
+ string_S = c("2023-12-30-00", NA),
+ string_U = c("2023-12-30-52", NA),
+ string_w = c("2023-12-30-6", NA),
+ string_W = c("2023-12-30-52", NA),
+ string_y = c("23-12-30", NA),
+ string_Y = c("2023-12-30", NA),
+ string_Om = c("2023-01-30", NA),
+ string_r = c("2023-12-30-01", NA),
+ string_R = c("2023-12-30-01:00", NA),
+ string_T = c("2023-12-30-01:00:00", NA)
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ parsed_H = parse_date_time(string_H, orders = "%Y-%m-%d-%H"),
+ parsed_I = parse_date_time(string_I, orders = "%Y-%m-%d-%I"),
+ parsed_j = parse_date_time(string_j, orders = "%Y-%m-%d-%j"),
+ parsed_M = parse_date_time(string_M, orders = "%Y-%m-%d-%M"),
+ parsed_S = parse_date_time(string_S, orders = "%Y-%m-%d-%S"),
+ parsed_U = parse_date_time(string_U, orders = "%Y-%m-%d-%U"),
+ parsed_w = parse_date_time(string_w, orders = "%Y-%m-%d-%w"),
+ parsed_W = parse_date_time(string_W, orders = "%Y-%m-%d-%W"),
+ parsed_y = parse_date_time(string_y, orders = "%y-%m-%d"),
+ parsed_Y = parse_date_time(string_Y, orders = "%Y-%m-%d"),
+ parsed_R = parse_date_time(string_R, orders = "%Y-%m-%d-%R"),
+ parsed_T = parse_date_time(string_T, orders = "%Y-%m-%d-%T")
+ ) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ parsed_H = parse_date_time(string_H, orders = "ymdH"),
+ parsed_I = parse_date_time(string_I, orders = "ymdI"),
+ parsed_j = parse_date_time(string_j, orders = "ymdj"),
+ parsed_M = parse_date_time(string_M, orders = "ymdM"),
+ parsed_S = parse_date_time(string_S, orders = "ymdS"),
+ parsed_U = parse_date_time(string_U, orders = "ymdU"),
+ parsed_w = parse_date_time(string_w, orders = "ymdw"),
+ parsed_W = parse_date_time(string_W, orders = "ymdW"),
+ parsed_y = parse_date_time(string_y, orders = "ymd"),
+ parsed_Y = parse_date_time(string_Y, orders = "Ymd"),
+ parsed_R = parse_date_time(string_R, orders = "ymdR"),
+ parsed_T = parse_date_time(string_T, orders = "ymdT")
+ ) %>%
+ collect(),
+ test_df
+ )
+
+ # Some formats are not supported on Windows
+ if (!tolower(Sys.info()[["sysname"]]) == "windows") {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ parsed_a = parse_date_time(string_a, orders = "%Y-%m-%d-%a"),
+ parsed_A = parse_date_time(string_A, orders = "%Y-%m-%d-%A"),
+ parsed_b = parse_date_time(string_b, orders = "%Y-%m-%d-%b"),
+ parsed_B = parse_date_time(string_B, orders = "%Y-%m-%d-%B"),
+ parsed_Om = parse_date_time(string_Om, orders = "%Y-%Om-%d"),
+ parsed_p = parse_date_time(string_p, orders = "%Y-%m-%d-%p"),
+ parsed_r = parse_date_time(string_r, orders = "%Y-%m-%d-%r")
+ ) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ parsed_a = parse_date_time(string_a, orders = "ymda"),
+ parsed_A = parse_date_time(string_A, orders = "ymdA"),
+ parsed_b = parse_date_time(string_b, orders = "ymdb"),
+ parsed_B = parse_date_time(string_B, orders = "ymdB"),
+ parsed_Om = parse_date_time(string_Om, orders = "yOmd"),
+ parsed_p = parse_date_time(string_p, orders = "ymdp"),
+ parsed_r = parse_date_time(string_r, orders = "ymdr")
+ ) %>%
+ collect(),
+ test_df
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ parsed_date_ymd = parse_date_time(string_1, orders = "Y-%b-d-%T")
+ ) %>%
+ collect(),
+ tibble::tibble(string_1 = c("2022-Feb-11-12:23:45", NA))
+ )
+ }
+
+})
+
test_that("lubridate's fast_strptime", {
compare_dplyr_binding(
.input %>%
@@ -2578,6 +2755,19 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", {
})
test_that("build_formats() and build_format_from_order()", {
+
+ ymd_formats <- c(
+ "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
+ "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d"
+ )
+
+ ymd_hms_formats <- c(
+ "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
+ "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
+ "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
+ "%y%b%d%H%M%S", "%Y%b%d%H%M%S"
+ )
+
expect_equal(
build_formats(c("ym", "myd", "%Y-%d-%m")),
c(
@@ -2595,20 +2785,11 @@ test_that("build_formats() and build_format_from_order()", {
expect_equal(
build_formats("ymd_HMS"),
- c(
- "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
- "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
- "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
- "%y%b%d%H%M%S", "%Y%b%d%H%M%S"
- )
+ ymd_hms_formats
)
- # when order is one of "yq", "qy", "ym" or"my" the data is augmented to "ymd"
+ # when order is one of "yq", "qy", "ym" or "my" the data is augmented to "ymd"
# or "ydm" and the formats are built accordingly
- ymd_formats <- c(
- "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
- "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d"
- )
expect_equal(
build_formats("yq"),
ymd_formats
@@ -2638,33 +2819,42 @@ test_that("build_formats() and build_format_from_order()", {
)
)
- # ab not supported yet
+ expect_equal(
+ build_format_from_order("abp"),
+ c(
+ "%a-%m-%p", "%A-%m-%p", "%a-%B-%p", "%A-%B-%p", "%a-%b-%p", "%A-%b-%p",
+ "%a%m%p", "%A%m%p", "%a%B%p", "%A%B%p", "%a%b%p", "%A%b%p"
+ )
+ )
+
expect_error(
- build_formats("abd"),
- '"abd" `orders` not supported in Arrow'
+ build_formats(c("vu", "ymd")),
+ '"vu" `orders` not supported in Arrow'
)
expect_error(
- build_formats("vup"),
- '"vup" `orders` not supported in Arrow'
+ build_formats(c("abc")),
+ '"abc" `orders` not supported in Arrow'
+ )
+
+ expect_equal(
+ build_formats("wIpz"),
+ c("%w-%I-%p-%z", "%w%I%p%z")
+ )
+
+ expect_equal(
+ build_formats("yOmd"),
+ ymd_formats
)
expect_equal(
build_format_from_order("ymd"),
- c(
- "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
- "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d"
- )
+ ymd_formats
)
expect_equal(
build_format_from_order("ymdHMS"),
- c(
- "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
- "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
- "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
- "%y%b%d%H%M%S", "%Y%b%d%H%M%S"
- )
+ ymd_hms_formats
)
expect_equal(
@@ -2686,6 +2876,29 @@ test_that("build_formats() and build_format_from_order()", {
"%y%b%d%H", "%Y%b%d%H"
)
)
+
+ expect_equal(
+ build_formats("y-%b-d-%T"),
+ c(
+ "%y-%m-%d-%I-%M-%S-%p", "%Y-%m-%d-%I-%M-%S-%p", "%y-%B-%d-%I-%M-%S-%p", "%Y-%B-%d-%I-%M-%S-%p",
+ "%y-%b-%d-%I-%M-%S-%p", "%Y-%b-%d-%I-%M-%S-%p", "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S",
+ "%y-%B-%d-%H-%M-%S", "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
+ "%y-%m-%d-%H-%M-%OS", "%Y-%m-%d-%H-%M-%OS", "%y-%B-%d-%H-%M-%OS", "%Y-%B-%d-%H-%M-%OS",
+ "%y-%b-%d-%H-%M-%OS", "%Y-%b-%d-%H-%M-%OS", "%y%m%d%I%M%S%p", "%Y%m%d%I%M%S%p",
+ "%y%B%d%I%M%S%p", "%Y%B%d%I%M%S%p", "%y%b%d%I%M%S%p", "%Y%b%d%I%M%S%p", "%y%m%d%H%M%S",
+ "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S", "%y%m%d%H%M%OS",
+ "%Y%m%d%H%M%OS", "%y%B%d%H%M%OS", "%Y%B%d%H%M%OS", "%y%b%d%H%M%OS", "%Y%b%d%H%M%OS"
+ )
+ )
+
+ expect_equal(
+ build_formats("%YdmH%p"),
+ c(
+ "%y-%d-%m-%H-%p", "%Y-%d-%m-%H-%p", "%y-%d-%B-%H-%p", "%Y-%d-%B-%H-%p",
+ "%y-%d-%b-%H-%p", "%Y-%d-%b-%H-%p", "%y%d%m%H%p", "%Y%d%m%H%p",
+ "%y%d%B%H%p", "%Y%d%B%H%p", "%y%d%b%H%p", "%Y%d%b%H%p"
+ )
+ )
})