You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/04/11 18:16:22 UTC

[arrow] branch master updated: ARROW-14810 [R] Implement bindings for lubridate's `date_decimal()` and `decimal_date()`

This is an automated email from the ASF dual-hosted git repository.

jonkeane pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 80bba5cbde ARROW-14810 [R] Implement bindings for lubridate's `date_decimal()` and `decimal_date()`
80bba5cbde is described below

commit 80bba5cbdef77e809a7b9bfec36eb5d6a61f0b5d
Author: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
AuthorDate: Mon Apr 11 13:16:13 2022 -0500

    ARROW-14810 [R] Implement bindings for lubridate's `date_decimal()` and `decimal_date()`
    
    This would allow the following operations:
    
    ``` r
    library(dplyr, warn.conflicts = FALSE)
    library(lubridate, warn.conflicts = FALSE)
    library(arrow, warn.conflicts = FALSE)
    
    test_df <- tibble(
      a = c(2007.38998954347, 1970.77732069883, 2020.96061799722,
            2009.43465948477, 1975.71251467871, NA),
      b = as.POSIXct(
        c("2007-05-23 08:18:30", "1970-10-11 17:19:45", "2020-12-17 14:04:06",
          "2009-06-08 15:37:01", "1975-09-18 01:37:42", NA)
      )
    )
    
    test_df %>%
      mutate(
        decimal_date_from_date = decimal_date(b),
        date_from_decimal = date_decimal(a)
      )
    #> # A tibble: 6 × 4
    #>       a b                   decimal_date_from_date date_from_decimal
    #>   <dbl> <dttm>                               <dbl> <dttm>
    #> 1 2007. 2007-05-23 08:18:30                  2007. 2007-05-23 08:18:30
    #> 2 1971. 1970-10-11 17:19:45                  1971. 1970-10-11 17:19:45
    #> 3 2021. 2020-12-17 14:04:06                  2021. 2020-12-17 14:04:06
    #> 4 2009. 2009-06-08 15:37:01                  2009. 2009-06-08 15:37:01
    #> 5 1976. 1975-09-18 01:37:42                  1976. 1975-09-18 01:37:42
    #> 6   NA  NA                                     NA  NA
    
    test_df %>%
      arrow_table() %>%
      mutate(
        decimal_date_from_date = decimal_date(b),
        date_from_decimal = date_decimal(a)
      ) %>%
      collect()
    #> # A tibble: 6 × 4
    #>       a b                   decimal_date_from_date date_from_decimal
    #>   <dbl> <dttm>                               <dbl> <dttm>
    #> 1 2007. 2007-05-23 08:18:30                  2007. 2007-05-23 08:18:30
    #> 2 1971. 1970-10-11 17:19:45                  1971. 1970-10-11 17:19:45
    #> 3 2021. 2020-12-17 14:04:06                  2021. 2020-12-17 14:04:06
    #> 4 2009. 2009-06-08 15:37:01                  2009. 2009-06-08 15:37:01
    #> 5 1976. 1975-09-18 01:37:42                  1976. 1975-09-18 01:37:42
    #> 6   NA  NA                                     NA  NA
    ```
    
    <sup>Created on 2022-03-28 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>
    
    Closes #12707 from dragosmg/decimal_dates
    
    Authored-by: Dragoș Moldovan-Grünfeld <dr...@gmail.com>
    Signed-off-by: Jonathan Keane <jk...@gmail.com>
---
 r/NEWS.md                                    |  7 +++--
 r/R/dplyr-funcs-datetime.R                   | 42 ++++++++++++++++++++++++++++
 r/tests/testthat/test-dplyr-funcs-datetime.R | 31 +++++++++++++++++++-
 3 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/r/NEWS.md b/r/NEWS.md
index 0a7d30d2a3..1a1f198e0f 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -22,10 +22,11 @@
 * `read_csv_arrow()`'s readr-style type `T` is now mapped to `timestamp(unit = "ns")` instead of `timestamp(unit = "s")`.
 * `lubridate`:
   * component extraction functions: `tz()` (timezone), `semester()` (semester), `dst()` (daylight savings time indicator), `date()` (extract date), `epiyear()` (epiyear), improvements to `month()`, which now works with integer inputs.
-  * `make_date()` & `make_datetime()` + `ISOdatetime()` & `ISOdate()` to create date-times from numeric representations. 
+  * Added `make_date()` & `make_datetime()` + `ISOdatetime()` & `ISOdate()` to create date-times from numeric representations. 
+  * Added `decimal_date()` and `date_decimal()`
 * date-time functionality:
-  * `difftime` and `as.difftime()` 
-  * `as.Date()` to convert to date
+  * Added `difftime` and `as.difftime()` 
+  * Added `as.Date()` to convert to date
 
 # arrow 7.0.0
 
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index 754d02a436..1ca485f56e 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -270,6 +270,20 @@ register_bindings_duration <- function() {
       time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC")))
     }
 
+    # if time1 or time2 are timestamps they cannot be expressed in "s" /seconds
+    # otherwise they cannot be added subtracted with durations
+    # TODO delete the casting to "us" once
+    # https://issues.apache.org/jira/browse/ARROW-16060 is solved
+    if (inherits(time1, "Expression") &&
+        time1$type_id() %in% Type[c("TIMESTAMP")] && time1$type()$unit() != 2L) {
+      time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp("us")))
+    }
+
+    if (inherits(time2, "Expression") &&
+        time2$type_id() %in% Type[c("TIMESTAMP")] && time2$type()$unit() != 2L) {
+      time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp("us")))
+    }
+
     # we need to go build the subtract expression instead of `time1 - time2` to
     # prevent complaints when we try to subtract an R object from an Expression
     subtract_output <- build_expr("-", time1, time2)
@@ -309,6 +323,34 @@ register_bindings_duration <- function() {
 
     build_expr("cast", x, options = cast_options(to_type = duration(unit = "s")))
   })
+  register_binding("decimal_date", function(date) {
+    y <- build_expr("year", date)
+    start <- call_binding("make_datetime", year = y, tz = "UTC")
+    sofar <- call_binding("difftime", date, start, units = "secs")
+    total <- call_binding(
+      "if_else",
+      build_expr("is_leap_year", date),
+      Expression$scalar(31622400L), # number of seconds in a leap year (366 days)
+      Expression$scalar(31536000L)  # number of seconds in a regular year (365 days)
+    )
+    y + sofar$cast(int64()) / total
+  })
+  register_binding("date_decimal", function(decimal, tz = "UTC") {
+    y <- build_expr("floor", decimal)
+
+    start <- call_binding("make_datetime", year = y, tz = tz)
+    seconds <- call_binding(
+      "if_else",
+      build_expr("is_leap_year", start),
+      Expression$scalar(31622400L), # number of seconds in a leap year (366 days)
+      Expression$scalar(31536000L)  # number of seconds in a regular year (365 days)
+    )
+
+    fraction <- decimal - y
+    delta <- build_expr("floor", seconds * fraction)
+    delta <- delta$cast(int64())
+    start + delta$cast(duration("s"))
+  })
 }
 
 binding_format_datetime <- function(x, format = "", tz = "", usetz = FALSE) {
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R
index c901742f65..b9277c08c4 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -1145,7 +1145,7 @@ test_that("difftime works correctly", {
     .input %>%
       mutate(
         secs2 = difftime(
-          as.POSIXct("2022-03-07", tz = "Europe/Bucharest"),
+          as.POSIXct("2022-03-07", tz = "Pacific/Marquesas"),
           time1,
           units = "secs"
         )
@@ -1224,3 +1224,32 @@ test_that("as.difftime()", {
       collect()
   )
 })
+
+test_that("`decimal_date()` and `date_decimal()`", {
+  test_df <- tibble(
+    a = c(2007.38998954347, 1970.77732069883, 2020.96061799722,
+          2009.43465948477, 1975.71251467871, NA),
+    b = as.POSIXct(
+      c("2007-05-23 08:18:30", "1970-10-11 17:19:45", "2020-12-17 14:04:06",
+        "2009-06-08 15:37:01", "1975-09-18 01:37:42", NA)
+    ),
+    c = as.Date(
+      c("2007-05-23", "1970-10-11", "2020-12-17", "2009-06-08", "1975-09-18", NA)
+    )
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        decimal_date_from_POSIXct = decimal_date(b),
+        decimal_date_from_r_POSIXct_obj = decimal_date(as.POSIXct("2022-03-25 15:37:01")),
+        decimal_date_from_r_date_obj = decimal_date(ymd("2022-03-25")),
+        decimal_date_from_date = decimal_date(c),
+        date_from_decimal = date_decimal(a),
+        date_from_decimal_r_obj = date_decimal(2022.178)
+      ) %>%
+      collect(),
+    test_df,
+    ignore_attr = "tzone"
+  )
+})