You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by am...@apache.org on 2022/05/19 09:57:40 UTC

[arrow] branch master updated: ARROW-16439: [R] Implement binding for `lubridate::fast_strptime`

This is an automated email from the ASF dual-hosted git repository.

amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7f31c9d2d9 ARROW-16439: [R] Implement binding for `lubridate::fast_strptime`
7f31c9d2d9 is described below

commit 7f31c9d2d95b5b30fa7997c2519d3b85919f1fd9
Author: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
AuthorDate: Thu May 19 11:57:32 2022 +0200

    ARROW-16439: [R] Implement binding for `lubridate::fast_strptime`
    
    This PR adds:
    * a binding emulating lubridate's `fast_strptime` functionality
    
    This PR does not support the following arguments:
    * `lt = TRUE` - this returns a POSIXlt object (a list) which cannot be easily used in a dplyr pipeline (currently it actually errors) => we have a different default `lt = FALSE` for the Arrow binding, and
    * `cutoff_2000 = 68L` - for the `y%` format two-digit numbers smaller or equal to `cutoff_2000` are parsed as though starting with 20, otherwise parsed as though starting with 19. It would be nice to have this, so I raised [ARROW-16596](https://issues.apache.org/jira/browse/ARROW-16596). We can always suggest users they manipulate the strings before parsing, so I don't think this is crucial functionality.
    
    The following code will be possible once the PR is merged:
    ``` r
    library(lubridate, warn.conflicts = FALSE)
    library(dplyr, warn.conflicts = FALSE)
    library(arrow, warn.conflicts = FALSE)
    
    dates_table <- tibble(
      string_with_short_year = c("68-05-17", "69-05-17", "55-05-17")
    )
    
    dates_table %>%
      mutate(
        date = fast_strptime(
          string_with_short_year,
          format = "%y-%m-%d",
          lt = FALSE
        )
      )
    #> # A tibble: 3 × 2
    #>   string_with_short_year date
    #>   <chr>                  <dttm>
    #> 1 68-05-17               2068-05-17 00:00:00
    #> 2 69-05-17               1969-05-17 00:00:00
    #> 3 55-05-17               2055-05-17 00:00:00
    
    dates_table %>%
      arrow_table() %>%
      mutate(
        date = fast_strptime(
          string_with_short_year,
          format = "%y-%m-%d",
          lt = FALSE
        )
      ) %>%
      collect()
    #> # A tibble: 3 × 2
    #>   string_with_short_year date
    #>   <chr>                  <dttm>
    #> 1 68-05-17               2068-05-17 00:00:00
    #> 2 69-05-17               1969-05-17 00:00:00
    #> 3 55-05-17               2055-05-17 00:00:00
    ```
    
    <sup>Created on 2022-05-18 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>
    
    Closes #13174 from dragosmg/fast_strptime_binding
    
    Authored-by: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
    Signed-off-by: Alessandro Molina <am...@turbogears.org>
---
 r/R/dplyr-funcs-datetime.R                   |  36 ++++++++
 r/tests/testthat/test-dplyr-funcs-datetime.R | 130 +++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)

diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index 02ec35bda2..64847f41fd 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -618,4 +618,40 @@ register_bindings_datetime_parsers <- function() {
   for (ymd_order in ymd_parser_vec) {
     register_binding(ymd_order, ymd_parser_map_factory(ymd_order))
   }
+
+  register_binding("fast_strptime", function(x,
+                                             format,
+                                             tz = "UTC",
+                                             lt = FALSE,
+                                             cutoff_2000 = 68L) {
+    # `lt` controls the output `lt = TRUE` returns a POSIXlt (which doesn't play
+    # well with mutate, for example)
+    if (lt) {
+      arrow_not_supported("`lt = TRUE` argument")
+    }
+
+    # TODO revisit once https://issues.apache.org/jira/browse/ARROW-16596 is done
+    if (cutoff_2000 != 68L) {
+      arrow_not_supported("`cutoff_2000` != 68L argument")
+    }
+
+    parse_attempt_expressions <- list()
+
+    parse_attempt_expressions <- map(
+      format,
+      ~ build_expr(
+        "strptime",
+        x,
+        options = list(
+          format = .x,
+          unit = 0L,
+          error_is_null = TRUE
+        )
+      )
+    )
+
+    coalesce_output <- build_expr("coalesce", args = parse_attempt_expressions)
+
+    build_expr("assume_timezone", coalesce_output, options = list(timezone = tz))
+  })
 }
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R
index b122363015..86b1862ab4 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -1812,3 +1812,133 @@ test_that("ym, my & yq parsers", {
     test_df
   )
 })
+
+test_that("lubridate's fast_strptime", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        y =
+          fast_strptime(
+            x,
+            format = "%Y-%m-%d %H:%M:%S",
+            lt = FALSE
+          )
+      ) %>%
+      collect(),
+    tibble(
+      x = c("2018-10-07 19:04:05", "2022-05-17 21:23:45", NA)
+    )#,
+    # arrow does not preserve the `tzone` attribute
+    # test ignore_attr = TRUE
+  )
+
+  # R object
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        y =
+          fast_strptime(
+            "68-10-07 19:04:05",
+            format = "%y-%m-%d %H:%M:%S",
+            lt = FALSE
+          )
+      ) %>%
+      collect(),
+    tibble(
+      x = c("2018-10-07 19:04:05", NA)
+    )#,
+    # test ignore_attr = TRUE
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_multi_formats =
+          fast_strptime(
+            x,
+            format = c("%Y-%m-%d %H:%M:%S", "%m-%d-%Y %H:%M:%S"),
+            lt = FALSE
+          )
+      ) %>%
+      collect(),
+    tibble(
+      x = c("2018-10-07 19:04:05", "10-07-1968 19:04:05")
+    )
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        dttm_with_tz = fast_strptime(
+          dttm_as_string,
+          format = "%Y-%m-%d %H:%M:%S",
+          tz = "Pacific/Marquesas",
+          lt = FALSE
+        )
+      ) %>%
+      collect(),
+    tibble(
+      dttm_as_string =
+        c("2018-10-07 19:04:05", "1969-10-07 19:04:05", NA)
+    )
+  )
+
+  # fast_strptime()'s `cutoff_2000` argument is not supported, but its value is
+  # implicitly set to 68L both in lubridate and in Arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_short_year =
+          fast_strptime(
+            x,
+            format = "%y-%m-%d %H:%M:%S",
+            lt = FALSE
+          )
+      ) %>%
+      collect(),
+    tibble(
+      x =
+        c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
+    )#,
+    # arrow does not preserve the `tzone` attribute
+    # test ignore_attr = TRUE
+  )
+
+  # the arrow binding errors for a value different from 68L for `cutoff_2000`
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_short_year =
+          fast_strptime(
+            x,
+            format = "%y-%m-%d %H:%M:%S",
+            lt = FALSE,
+            cutoff_2000 = 69L
+          )
+      ) %>%
+      collect(),
+    tibble(
+      x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
+    ),
+    warning = TRUE
+  )
+
+  # compare_dplyr_binding would not work here since lt = TRUE returns a list
+  # and it also errors in regular dplyr pipelines
+  expect_warning(
+    tibble(
+      x = c("68-10-07 19:04:05", "69-10-07 19:04:05", NA)
+    ) %>%
+      arrow_table() %>%
+      mutate(
+        date_short_year =
+          fast_strptime(
+            x,
+            format = "%y-%m-%d %H:%M:%S",
+            lt = TRUE
+          )
+      ) %>%
+      collect()
+  )
+})