You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2022/07/29 01:35:26 UTC

[arrow] branch master updated: ARROW-16653: [R] All formats are supported with the lubridate `parse_date_time` binding (#13506)

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3e87c66ae0 ARROW-16653: [R] All formats are supported with the lubridate `parse_date_time` binding (#13506)
3e87c66ae0 is described below

commit 3e87c66ae00cd25cb9e5e42424b90ace988856ac
Author: Rok Mihevc <ro...@mihevc.org>
AuthorDate: Fri Jul 29 03:35:19 2022 +0200

    ARROW-16653: [R] All formats are supported with the lubridate `parse_date_time` binding (#13506)
    
    This is to resolve [ARROW-16653](https://issues.apache.org/jira/browse/ARROW-16653).
    
    Please note the intent here is to map out currently available formats via a new test. That is to support [ARROW-16395](https://issues.apache.org/jira/browse/ARROW-16395) which will add `ymd_hms() ymd_hm() ymd_h() dmy_hms() dmy_hm() dmy_h() mdy_hms() mdy_hm() mdy_h() ydm_hms() ydm_hm() ydm_h()`.
    
    Currently most [lubridate supported flags](https://lubridate.tidyverse.org/reference/parse_date_time.html#details) are available and this adds a test. Remaining are `%q` and `%Op` that we don't need to resolve [ARROW-16395](https://issues.apache.org/jira/browse/ARROW-16395). We could open a ticket for adding support for the two remaining flags to C++ `strptime` or wait for users to request them.
    
    Lead-authored-by: Rok <ro...@mihevc.org>
    Co-authored-by: Rok Mihevc <ro...@mihevc.org>
    Signed-off-by: Dewey Dunnington <de...@fishandwhistle.net>
---
 r/R/dplyr-datetime-helpers.R                 | 110 ++++++-----
 r/tests/testthat/test-dplyr-funcs-datetime.R | 265 ++++++++++++++++++++++++---
 2 files changed, 292 insertions(+), 83 deletions(-)

diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index efcc62ff4e..4c9a8d1bf0 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -163,15 +163,31 @@ build_formats <- function(orders) {
   # process the `orders` (even if supplied in the desired format)
   # Processing is needed (instead of passing
   # formats as-is) due to the processing of the character vector in parse_date_time()
+
   orders <- gsub("[^A-Za-z]", "", orders)
   orders <- gsub("Y", "y", orders)
 
+  valid_formats <- "[a|A|b|B|d|H|I|j|m|Om|M|Op|p|q|OS|S|U|w|W|y|Y|r|R|T|z]"
+  invalid_orders <- nchar(gsub(valid_formats, "", orders)) > 0
+
+  if (any(invalid_orders)) {
+    arrow_not_supported(
+      paste0(
+        oxford_paste(
+          orders[invalid_orders]
+        ),
+        " `orders`"
+      )
+    )
+  }
+
   # we separate "ym', "my", and "yq" from the rest of the `orders` vector and
   # transform them. `ym` and `yq` -> `ymd` & `my` -> `myd`
   # this is needed for 2 reasons:
   # 1. strptime does not parse "2022-05" -> we add "-01", thus changing the format,
   # 2. for equivalence to lubridate, which parses `ym` to the first day of the month
-  short_orders <- c("ym", "my")
+  short_orders <- c("ym", "my", "yOm", "Omy")
+  quarter_orders <- c("yq", "qy")
 
   if (any(orders %in% short_orders)) {
     orders1 <- setdiff(orders, short_orders)
@@ -179,51 +195,10 @@ build_formats <- function(orders) {
     orders2 <- paste0(orders2, "d")
     orders <- unique(c(orders2, orders1))
   }
-
-  if (any(orders == "yq")) {
-    orders1 <- setdiff(orders, "yq")
-    orders2 <- "ymd"
-    orders <- unique(c(orders1, orders2))
-  }
-
-  if (any(orders == "qy")) {
-    orders1 <- setdiff(orders, "qy")
-    orders2 <- "ymd"
-    orders <- unique(c(orders1, orders2))
-  }
-
-  ymd_orders <- c("ymd", "ydm", "mdy", "myd", "dmy", "dym")
-  ymd_hms_orders <- c(
-    "ymd_HMS", "ymd_HM", "ymd_H", "dmy_HMS", "dmy_HM", "dmy_H", "mdy_HMS",
-    "mdy_HM", "mdy_H", "ydm_HMS", "ydm_HM", "ydm_H"
-  )
-  # support "%I" hour formats
-  ymd_ims_orders <- gsub("H", "I", ymd_hms_orders)
-
-  supported_orders <- c(
-    ymd_orders,
-    ymd_hms_orders,
-    gsub("_", " ", ymd_hms_orders), # allow "_", " " and "" as order separators
-    gsub("_", "", ymd_hms_orders),
-    ymd_ims_orders,
-    gsub("_", " ", ymd_ims_orders), # allow "_", " " and "" as order separators
-    gsub("_", "", ymd_ims_orders)
-  )
-
-  unsupported_passed_orders <- setdiff(orders, supported_orders)
-  supported_passed_orders <- intersect(orders, supported_orders)
-
-  # error only if there isn't at least one valid order we can try
-  if (length(supported_passed_orders) == 0) {
-    arrow_not_supported(
-      paste0(
-        oxford_paste(
-          unsupported_passed_orders
-        ),
-        " `orders`"
-      )
-    )
+  if (any(orders %in% quarter_orders)) {
+    orders <- c(setdiff(orders, quarter_orders), "ymd")
   }
+  orders <- unique(orders)
 
   formats_list <- map(orders, build_format_from_order)
   formats <- purrr::flatten_chr(formats_list)
@@ -239,26 +214,47 @@ build_formats <- function(orders) {
 #'
 #' @noRd
 build_format_from_order <- function(order) {
+  month_formats <- c("%m", "%B", "%b")
+  week_formats <- c("%a", "%A")
+  year_formats <- c("%y", "%Y")
   char_list <- list(
-    "y" = c("%y", "%Y"),
-    "m" = c("%m", "%B", "%b"),
-    "d" = "%d",
-    "H" = "%H",
-    "M" = "%M",
-    "S" = "%S",
-    "I" = "%I"
+    "%y" = year_formats,
+    "%Y" = year_formats,
+    "%m" = month_formats,
+    "%Om" = month_formats,
+    "%b" = month_formats,
+    "%B" = month_formats,
+    "%a" = week_formats,
+    "%A" = week_formats,
+    "%d" = "%d",
+    "%H" = "%H",
+    "%j" = "%j",
+    "%OS" = "%OS",
+    "%I" = "%I",
+    "%S" = "%S",
+    "%q" = "%q",
+    "%M" = "%M",
+    "%U" = "%U",
+    "%w" = "%w",
+    "%W" = "%W",
+    "%p" = "%p",
+    "%Op" = "%Op",
+    "%z" = "%z",
+    "%r" = c("%H", "%I-%p"),
+    "%R" = c("%H-%M", "%I-%M-%p"),
+    "%T" = c("%I-%M-%S-%p", "%H-%M-%S", "%H-%M-%OS")
   )
 
-  split_order <- strsplit(order, split = "")[[1]]
-
+  split_order <- regmatches(order, gregexpr("(O{0,1}[a-zA-Z])", order))[[1]]
+  split_order <- paste0("%", split_order)
   outcome <- expand.grid(char_list[split_order])
+
   # we combine formats with and without the "-" separator, we will later
   # coalesce through all of them (benchmarking indicated this is a more
   # computationally efficient approach rather than figuring out if a string has
-  # separators or not and applying only )
-  # during parsing if the string to be parsed does not contain a separator
+  # separators or not and applying the relevant order afterwards)
   formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
-  formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+  formats_without_sep <- gsub("-", "", formats_with_sep)
   c(formats_with_sep, formats_without_sep)
 }
 
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R
index b8bd28e970..25fe23a28d 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -150,6 +150,71 @@ test_that("strptime", {
     as.POSIXct(tstamp),
     ignore_attr = "tzone"
   )
+
+  # these functions' internals use some string processing which requires the
+  # RE2 library (not available on Windows with R 3.6)
+  skip_if_not_available("re2")
+
+  tz <- "Pacific/Marquesas"
+  set.seed(42)
+  times <- seq(as.POSIXct("1999-02-07", tz = tz), as.POSIXct("2000-01-01", tz = tz), by = "sec")
+  times <- sample(times, 100)
+
+  # Op format is currently not supported by strptime
+  formats <- c(
+    "%d", "%H", "%j", "%m", "%T",
+    "%S", "%q", "%M", "%U", "%w", "%W", "%y", "%Y", "%R", "%T"
+  )
+  formats2 <- c(
+    "a", "A", "b", "B", "d", "H", "j", "m", "Om", "T", "OS", "Ip",
+    "S", "q", "M", "U", "w", "W", "y", "Y", "r", "R", "Tz"
+  )
+  base_format <- "%Y-%m-%d"
+  base_format2 <- "ymd"
+
+  # Some formats are not supported on Windows
+  if (!tolower(Sys.info()[["sysname"]]) == "windows") {
+    formats <- c(formats, "%a", "%A", "%b", "%B", "%Om", "%OS", "%I%p", "%r", "%T%z")
+  }
+
+  for (fmt in formats) {
+    fmt <- paste(base_format, fmt)
+    test_df <- tibble::tibble(x = strftime(times, format = fmt))
+    expect_equal(
+      test_df %>%
+        arrow_table() %>%
+          mutate(x = strptime(x, format = fmt)) %>%
+          collect(),
+      test_df %>%
+        mutate(x = as.POSIXct(strptime(x, format = fmt))) %>%
+        collect()
+    )
+  }
+
+  for (fmt in formats2) {
+    fmt2 <- paste(base_format2, fmt)
+    fmt <- paste(base_format, paste0("%", fmt))
+    test_df <- tibble::tibble(x = strftime(times, format = fmt))
+    expect_equal(
+      test_df %>%
+        arrow_table() %>%
+          mutate(x = strptime(x, format = fmt2)) %>%
+          collect(),
+      test_df %>%
+        mutate(x = as.POSIXct(strptime(x, format = fmt2))) %>%
+        collect()
+    )
+  }
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        parsed_date_ymd = parse_date_time(string_1, orders = "Y-%m-d-%T")
+      ) %>%
+      collect(),
+    tibble::tibble(string_1 = c("2022-02-11-12:23:45", NA))
+  )
+
 })
 
 test_that("strptime returns NA when format doesn't match the data", {
@@ -2045,6 +2110,118 @@ test_that("ym, my & yq parsers", {
   )
 })
 
+test_that("parse_date_time's other formats", {
+  # these functions' internals use some string processing which requires the
+  # RE2 library (not available on Windows with R 3.6)
+  skip_if_not_available("re2")
+
+  # q, OS, Op, z formats are currently not supported by strptime
+  test_df <- tibble(
+    string_a = c("2023-12-30-Sat", NA),
+    string_A = c("2023-12-30-Saturday", NA),
+    string_b = c("2023-12-30-Dec", NA),
+    string_B = c("2023-12-30-December", NA),
+    string_H = c("2023-12-30-01", NA),
+    string_I = c("2023-12-30-01", NA),
+    string_j = c("2023-12-30-364", NA),
+    string_M = c("2023-12-30-00", NA),
+    string_p = c("2023-12-30-AM", NA),
+    string_S = c("2023-12-30-00", NA),
+    string_U = c("2023-12-30-52", NA),
+    string_w = c("2023-12-30-6", NA),
+    string_W = c("2023-12-30-52", NA),
+    string_y = c("23-12-30", NA),
+    string_Y = c("2023-12-30", NA),
+    string_Om = c("2023-01-30", NA),
+    string_r = c("2023-12-30-01", NA),
+    string_R = c("2023-12-30-01:00", NA),
+    string_T = c("2023-12-30-01:00:00", NA)
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        parsed_H = parse_date_time(string_H, orders = "%Y-%m-%d-%H"),
+        parsed_I = parse_date_time(string_I, orders = "%Y-%m-%d-%I"),
+        parsed_j = parse_date_time(string_j, orders = "%Y-%m-%d-%j"),
+        parsed_M = parse_date_time(string_M, orders = "%Y-%m-%d-%M"),
+        parsed_S = parse_date_time(string_S, orders = "%Y-%m-%d-%S"),
+        parsed_U = parse_date_time(string_U, orders = "%Y-%m-%d-%U"),
+        parsed_w = parse_date_time(string_w, orders = "%Y-%m-%d-%w"),
+        parsed_W = parse_date_time(string_W, orders = "%Y-%m-%d-%W"),
+        parsed_y = parse_date_time(string_y, orders = "%y-%m-%d"),
+        parsed_Y = parse_date_time(string_Y, orders = "%Y-%m-%d"),
+        parsed_R = parse_date_time(string_R, orders = "%Y-%m-%d-%R"),
+        parsed_T = parse_date_time(string_T, orders = "%Y-%m-%d-%T")
+      ) %>%
+      collect(),
+    test_df
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        parsed_H = parse_date_time(string_H, orders = "ymdH"),
+        parsed_I = parse_date_time(string_I, orders = "ymdI"),
+        parsed_j = parse_date_time(string_j, orders = "ymdj"),
+        parsed_M = parse_date_time(string_M, orders = "ymdM"),
+        parsed_S = parse_date_time(string_S, orders = "ymdS"),
+        parsed_U = parse_date_time(string_U, orders = "ymdU"),
+        parsed_w = parse_date_time(string_w, orders = "ymdw"),
+        parsed_W = parse_date_time(string_W, orders = "ymdW"),
+        parsed_y = parse_date_time(string_y, orders = "ymd"),
+        parsed_Y = parse_date_time(string_Y, orders = "Ymd"),
+        parsed_R = parse_date_time(string_R, orders = "ymdR"),
+        parsed_T = parse_date_time(string_T, orders = "ymdT")
+      ) %>%
+      collect(),
+    test_df
+  )
+
+  # Some formats are not supported on Windows
+  if (!tolower(Sys.info()[["sysname"]]) == "windows") {
+    compare_dplyr_binding(
+      .input %>%
+        mutate(
+          parsed_a = parse_date_time(string_a, orders = "%Y-%m-%d-%a"),
+          parsed_A = parse_date_time(string_A, orders = "%Y-%m-%d-%A"),
+          parsed_b = parse_date_time(string_b, orders = "%Y-%m-%d-%b"),
+          parsed_B = parse_date_time(string_B, orders = "%Y-%m-%d-%B"),
+          parsed_Om = parse_date_time(string_Om, orders = "%Y-%Om-%d"),
+          parsed_p = parse_date_time(string_p, orders = "%Y-%m-%d-%p"),
+          parsed_r = parse_date_time(string_r, orders = "%Y-%m-%d-%r")
+        ) %>%
+        collect(),
+      test_df
+    )
+
+    compare_dplyr_binding(
+      .input %>%
+        mutate(
+          parsed_a = parse_date_time(string_a, orders = "ymda"),
+          parsed_A = parse_date_time(string_A, orders = "ymdA"),
+          parsed_b = parse_date_time(string_b, orders = "ymdb"),
+          parsed_B = parse_date_time(string_B, orders = "ymdB"),
+          parsed_Om = parse_date_time(string_Om, orders = "yOmd"),
+          parsed_p = parse_date_time(string_p, orders = "ymdp"),
+          parsed_r = parse_date_time(string_r, orders = "ymdr")
+        ) %>%
+        collect(),
+      test_df
+    )
+
+    compare_dplyr_binding(
+      .input %>%
+        mutate(
+          parsed_date_ymd = parse_date_time(string_1, orders = "Y-%b-d-%T")
+        ) %>%
+        collect(),
+      tibble::tibble(string_1 = c("2022-Feb-11-12:23:45", NA))
+    )
+  }
+
+})
+
 test_that("lubridate's fast_strptime", {
   compare_dplyr_binding(
     .input %>%
@@ -2578,6 +2755,19 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", {
 })
 
 test_that("build_formats() and build_format_from_order()", {
+
+  ymd_formats <- c(
+    "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
+    "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d"
+  )
+
+  ymd_hms_formats <- c(
+    "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
+    "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
+    "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
+    "%y%b%d%H%M%S", "%Y%b%d%H%M%S"
+  )
+
   expect_equal(
     build_formats(c("ym", "myd", "%Y-%d-%m")),
     c(
@@ -2595,20 +2785,11 @@ test_that("build_formats() and build_format_from_order()", {
 
   expect_equal(
     build_formats("ymd_HMS"),
-    c(
-      "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
-      "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
-      "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
-      "%y%b%d%H%M%S", "%Y%b%d%H%M%S"
-    )
+    ymd_hms_formats
   )
 
-  # when order is one of "yq", "qy", "ym" or"my" the data is augmented to "ymd"
+  # when order is one of "yq", "qy", "ym" or "my" the data is augmented to "ymd"
   # or "ydm" and the formats are built accordingly
-  ymd_formats <- c(
-    "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
-    "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d"
-  )
   expect_equal(
     build_formats("yq"),
     ymd_formats
@@ -2638,33 +2819,42 @@ test_that("build_formats() and build_format_from_order()", {
     )
   )
 
-  # ab not supported yet
+  expect_equal(
+    build_format_from_order("abp"),
+    c(
+      "%a-%m-%p", "%A-%m-%p", "%a-%B-%p", "%A-%B-%p", "%a-%b-%p", "%A-%b-%p",
+      "%a%m%p", "%A%m%p", "%a%B%p", "%A%B%p", "%a%b%p", "%A%b%p"
+    )
+  )
+
   expect_error(
-    build_formats("abd"),
-    '"abd" `orders` not supported in Arrow'
+    build_formats(c("vu", "ymd")),
+    '"vu" `orders` not supported in Arrow'
   )
 
   expect_error(
-    build_formats("vup"),
-    '"vup" `orders` not supported in Arrow'
+    build_formats(c("abc")),
+    '"abc" `orders` not supported in Arrow'
+  )
+
+  expect_equal(
+    build_formats("wIpz"),
+    c("%w-%I-%p-%z", "%w%I%p%z")
+  )
+
+  expect_equal(
+    build_formats("yOmd"),
+    ymd_formats
   )
 
   expect_equal(
     build_format_from_order("ymd"),
-    c(
-      "%y-%m-%d", "%Y-%m-%d", "%y-%B-%d", "%Y-%B-%d", "%y-%b-%d", "%Y-%b-%d",
-      "%y%m%d", "%Y%m%d", "%y%B%d", "%Y%B%d", "%y%b%d", "%Y%b%d"
-    )
+    ymd_formats
   )
 
   expect_equal(
     build_format_from_order("ymdHMS"),
-    c(
-      "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S", "%y-%B-%d-%H-%M-%S",
-      "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
-      "%y%m%d%H%M%S", "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S",
-      "%y%b%d%H%M%S", "%Y%b%d%H%M%S"
-    )
+    ymd_hms_formats
   )
 
   expect_equal(
@@ -2686,6 +2876,29 @@ test_that("build_formats() and build_format_from_order()", {
       "%y%b%d%H", "%Y%b%d%H"
     )
   )
+
+  expect_equal(
+    build_formats("y-%b-d-%T"),
+    c(
+      "%y-%m-%d-%I-%M-%S-%p", "%Y-%m-%d-%I-%M-%S-%p", "%y-%B-%d-%I-%M-%S-%p", "%Y-%B-%d-%I-%M-%S-%p",
+      "%y-%b-%d-%I-%M-%S-%p", "%Y-%b-%d-%I-%M-%S-%p", "%y-%m-%d-%H-%M-%S", "%Y-%m-%d-%H-%M-%S",
+      "%y-%B-%d-%H-%M-%S", "%Y-%B-%d-%H-%M-%S", "%y-%b-%d-%H-%M-%S", "%Y-%b-%d-%H-%M-%S",
+      "%y-%m-%d-%H-%M-%OS", "%Y-%m-%d-%H-%M-%OS", "%y-%B-%d-%H-%M-%OS", "%Y-%B-%d-%H-%M-%OS",
+      "%y-%b-%d-%H-%M-%OS", "%Y-%b-%d-%H-%M-%OS", "%y%m%d%I%M%S%p", "%Y%m%d%I%M%S%p",
+      "%y%B%d%I%M%S%p", "%Y%B%d%I%M%S%p", "%y%b%d%I%M%S%p", "%Y%b%d%I%M%S%p", "%y%m%d%H%M%S",
+      "%Y%m%d%H%M%S", "%y%B%d%H%M%S", "%Y%B%d%H%M%S", "%y%b%d%H%M%S", "%Y%b%d%H%M%S", "%y%m%d%H%M%OS",
+      "%Y%m%d%H%M%OS", "%y%B%d%H%M%OS", "%Y%B%d%H%M%OS", "%y%b%d%H%M%OS", "%Y%b%d%H%M%OS"
+    )
+  )
+
+  expect_equal(
+    build_formats("%YdmH%p"),
+    c(
+      "%y-%d-%m-%H-%p", "%Y-%d-%m-%H-%p", "%y-%d-%B-%H-%p", "%Y-%d-%B-%H-%p",
+      "%y-%d-%b-%H-%p", "%Y-%d-%b-%H-%p", "%y%d%m%H%p", "%Y%d%m%H%p",
+      "%y%d%B%H%p", "%Y%d%B%H%p", "%y%d%b%H%p", "%Y%d%b%H%p"
+    )
+  )
 })