You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/03/28 19:44:57 UTC

[GitHub] [arrow] jonkeane commented on a change in pull request #12677: ARROW-15800 [R] bindings for `lubridate::as_date()` and `lubridate::as_datetime()`

jonkeane commented on a change in pull request #12677:
URL: https://github.com/apache/arrow/pull/12677#discussion_r836786450



##########
File path: r/tests/testthat/test-dplyr-funcs-datetime.R
##########
@@ -1213,3 +1213,203 @@ test_that("as.difftime()", {
       collect()
   )
 })
+
+test_that("as.Date() converts successfully from date, timestamp, integer, char and double", {
+  test_df <- tibble::tibble(
+    posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"),
+    date_var = as.Date("2022-02-25"),
+    character_ymd_var = "2022-02-25 00:00:01",
+    character_ydm_var = "2022/25/02 00:00:01",
+    integer_var = 32L,
+    integerish_var = 32,
+    double_var = 34.56
+  )
+
+  # casting from POSIXct treated separately so we can skip on Windows
+  # TODO move the test for casting from POSIXct below once ARROW-13168 is done
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_dv = as.Date(date_var),
+        date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"),
+        date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"),
+        date_int = as.Date(integer_var, origin = "1970-01-01"),
+        date_integerish = as.Date(integerish_var, origin = "1970-01-01")
+      ) %>%
+      collect(),
+    test_df
+  )
+
+  # currently we do not support an origin different to "1970-01-01"
+  compare_dplyr_binding(
+    .input %>%
+      mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>%
+      collect(),
+    test_df,
+    warning = TRUE
+  )
+
+  # we do not support multiple tryFormats
+  compare_dplyr_binding(
+    .input %>%
+      mutate(date_char_ymd = as.Date(character_ymd_var,
+                                     tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>%
+      collect(),
+    test_df,
+    warning = TRUE
+  )
+
+  expect_error(
+    test_df %>%
+      arrow_table() %>%
+      mutate(date_char_ymd = as.Date(character_ymd_var)) %>%
+      collect(),
+    regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]",
+    fixed = TRUE
+  )
+
+  # we do not support as.Date() with double/ float (error surfaced from C++)
+  expect_error(
+    test_df %>%
+      arrow_table() %>%
+      mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>%
+      collect()
+  )
+
+  skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_pv = as.Date(posixct_var),
+        date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas")
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("as_date()", {
+  test_df <- tibble::tibble(
+    posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"),
+    date_var = as.Date("2022-02-25"),
+    difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Pacific/Marquesas"),
+    character_ymd_var = "2022-02-25 00:00:01",
+    character_ydm_var = "2022/25/02 00:00:01",
+    integer_var = 32L,
+    integerish_var = 32,
+    double_var = 34.56
+  )

Review comment:
       This fixture looks very similar (though subtly different in some ways) as the one for `as.Date()` would it make sense to combine them into a single test_that block so they can use one fixture and we don't have to redefine it + make sure we update it in both places, etc?
   
   If that's not feasible, should we move the fixture out to `helpers-data.R` so that it's reusable?

##########
File path: r/R/dplyr-funcs-datetime.R
##########
@@ -239,66 +303,108 @@ register_bindings_duration <- function() {
                                        tz = "UTC") {
     call_binding("make_datetime", year, month, day, hour, min, sec, tz)
   })
-  register_binding("difftime", function(time1,
-                                        time2,
-                                        tz,
-                                        units = "secs") {
-    if (units != "secs") {
-      abort("`difftime()` with units other than `secs` not supported in Arrow")
-    }
+  register_binding("as.Date", function(x,
+                                       format = NULL,
+                                       tryFormats = "%Y-%m-%d",
+                                       origin = "1970-01-01",
+                                       tz = "UTC") {
 
-    if (!missing(tz)) {
-      warn("`tz` argument is not supported in Arrow, so it will be ignored")
+    # the origin argument will be better supported once we implement temporal
+    # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
+    # TODO revisit once the above has been sorted
+    if (call_binding("is.numeric", x) & origin != "1970-01-01") {
+      abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow")
     }
 
-    # cast to timestamp if time1 and time2 are not dates or timestamp expressions
-    # (the subtraction of which would output a `duration`)
-    if (!call_binding("is.instant", time1)) {
-      time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC")))
+    # this could be improved with tryFormats once strptime returns NA and we
+    # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659
+    # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done
+    if (is.null(format) && length(tryFormats) > 1) {
+      abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
     }
 
-    if (!call_binding("is.instant", time2)) {
-      time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC")))
-    }
+    if (call_binding("is.Date", x)) {
+      return(x)
 
-    # we need to go build the subtract expression instead of `time1 - time2` to
-    # prevent complaints when we try to subtract an R object from an Expression
-    subtract_output <- build_expr("-", time1, time2)
-    build_expr("cast", subtract_output, options = cast_options(to_type = duration("s")))
+      # cast from POSIXct
+    } else if (call_binding("is.POSIXct", x)) {
+      # base::as.Date() first converts to the desired timezone and then extracts
+      # the date, which is why we need to go through timestamp() first
+      x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz)))
+
+      # cast from character
+    } else if (call_binding("is.character", x)) {
+      format <- format %||% tryFormats[[1]]
+      # unit = 0L is the identifier for seconds in valid_time32_units
+      x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
+
+      # cast from numeric
+    } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) {
+      # Arrow does not support direct casting from double to date32(), but for
+      # integer-like values we can go via int32()
+      # https://issues.apache.org/jira/browse/ARROW-15798
+      # TODO revisit if arrow decides to support double -> date casting
+      x <- build_expr("cast", x, options = cast_options(to_type = int32()))
+    }
+    build_expr("cast", x, options = cast_options(to_type = date32()))
   })
-  register_binding("as.difftime", function(x,
-                                           format = "%X",
-                                           units = "secs") {
-    # windows doesn't seem to like "%X"
-    if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") {
-      format <- "%H:%M:%S"
+  register_binding("as_date", function(x,
+                                       format = NULL,
+                                       origin = "1970-01-01",
+                                       tz = "UTC") {
+    # the origin argument will be better supported once we implement temporal
+    # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
+    # TODO revisit once the above has been sorted
+    if (call_binding("is.numeric", x) & origin != "1970-01-01") {
+      abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow")
     }
 
-    if (units != "secs") {
-      abort("`as.difftime()` with units other than 'secs' not supported in Arrow")
+    # assume format is ISO if unspecified (to align with lubridate::as_date)
+    if (is.null(format)) {
+      format <- "%Y-%m-%d"
     }
 
-    if (call_binding("is.character", x)) {
-      x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
-      # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us)
-      # and then we cast to duration ("s") at the end
-      x <- x$cast(time64("us"))$cast(int64())$cast(duration("us"))
-    }
+    if (call_binding("is.Date", x)) {
+      return(x)
 
-    # numeric -> duration not supported in Arrow yet so we use int64() as an
-    # intermediate step
-    # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results
-    # in numeric -> duration support
+      # cast from POSIXct
+    } else if (call_binding("is.POSIXct", x)) {
+      # this is where as_date() differs from as.Date()
+      if (!missing(tz)) {
+        x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz)))
+      }
+      # POSIXct is of type double -> we need this to prevent going down the
+      # "double" branch
+      x <- x
+
+      # cast from character
+    } else if (call_binding("is.character", x)) {
+      # unit = 0L is the identifier for seconds in valid_time32_units
+      x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
 
+      # cast from numeric
+    } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) {
+      # Arrow does not support direct casting from double to date32(), but for
+      # integer-like values we can go via int32()
+      # https://issues.apache.org/jira/browse/ARROW-15798
+      # TODO revisit if arrow decides to support double -> date casting
+      x <- build_expr("cast", x, options = cast_options(to_type = int32()))
+    }
+    build_expr("cast", x, options = cast_options(to_type = date32()))
+  })
+  register_binding("as_datetime", function(x,
+                                           origin = "1970-01-01",
+                                           tz = "UTC") {
     if (call_binding("is.numeric", x)) {
-      # coerce x to be int64(). it should work for integer-like doubles and fail
-      # for pure doubles
-      # if we abort for all doubles, we risk erroring in cases in which
-      # coercion to int64() would work
+      delta <- call_binding("difftime", origin, "1970-01-01")
+      delta <- build_expr("cast", delta, options = cast_options(to_type = int64()))
       x <- build_expr("cast", x, options = cast_options(to_type = int64()))
+      output <- build_expr("+", x, delta)
+      output <- build_expr("cast", output, options = cast_options(to_type = timestamp()))
+    } else {
+      output <- build_expr("cast", x, options = cast_options(to_type = timestamp()))
     }
-
-    build_expr("cast", x, options = cast_options(to_type = duration(unit = "s")))
+    build_expr("assume_timezone", output, options = list(timezone = tz))

Review comment:
       I can't suggest an edit here for some reason, but could we not rename to `output` here and then have one line that does `build_expr("cast", x, options = cast_options(to_type = timestamp()))` for both? (which could be outside of the if|else entirely, and we wouldn't even need an `else` at all?)

##########
File path: r/R/dplyr-funcs-datetime.R
##########
@@ -239,66 +303,108 @@ register_bindings_duration <- function() {
                                        tz = "UTC") {
     call_binding("make_datetime", year, month, day, hour, min, sec, tz)
   })
-  register_binding("difftime", function(time1,
-                                        time2,
-                                        tz,
-                                        units = "secs") {
-    if (units != "secs") {
-      abort("`difftime()` with units other than `secs` not supported in Arrow")
-    }
+  register_binding("as.Date", function(x,
+                                       format = NULL,
+                                       tryFormats = "%Y-%m-%d",
+                                       origin = "1970-01-01",
+                                       tz = "UTC") {
 
-    if (!missing(tz)) {
-      warn("`tz` argument is not supported in Arrow, so it will be ignored")
+    # the origin argument will be better supported once we implement temporal
+    # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
+    # TODO revisit once the above has been sorted
+    if (call_binding("is.numeric", x) & origin != "1970-01-01") {
+      abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow")
     }
 
-    # cast to timestamp if time1 and time2 are not dates or timestamp expressions
-    # (the subtraction of which would output a `duration`)
-    if (!call_binding("is.instant", time1)) {
-      time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC")))
+    # this could be improved with tryFormats once strptime returns NA and we
+    # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659
+    # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done
+    if (is.null(format) && length(tryFormats) > 1) {
+      abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
     }
 
-    if (!call_binding("is.instant", time2)) {
-      time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC")))
-    }
+    if (call_binding("is.Date", x)) {
+      return(x)
 
-    # we need to go build the subtract expression instead of `time1 - time2` to
-    # prevent complaints when we try to subtract an R object from an Expression
-    subtract_output <- build_expr("-", time1, time2)
-    build_expr("cast", subtract_output, options = cast_options(to_type = duration("s")))
+      # cast from POSIXct
+    } else if (call_binding("is.POSIXct", x)) {
+      # base::as.Date() first converts to the desired timezone and then extracts
+      # the date, which is why we need to go through timestamp() first
+      x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz)))
+
+      # cast from character
+    } else if (call_binding("is.character", x)) {
+      format <- format %||% tryFormats[[1]]
+      # unit = 0L is the identifier for seconds in valid_time32_units
+      x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
+
+      # cast from numeric
+    } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) {
+      # Arrow does not support direct casting from double to date32(), but for
+      # integer-like values we can go via int32()
+      # https://issues.apache.org/jira/browse/ARROW-15798
+      # TODO revisit if arrow decides to support double -> date casting
+      x <- build_expr("cast", x, options = cast_options(to_type = int32()))
+    }
+    build_expr("cast", x, options = cast_options(to_type = date32()))
   })
-  register_binding("as.difftime", function(x,
-                                           format = "%X",
-                                           units = "secs") {
-    # windows doesn't seem to like "%X"
-    if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") {
-      format <- "%H:%M:%S"
+  register_binding("as_date", function(x,
+                                       format = NULL,
+                                       origin = "1970-01-01",
+                                       tz = "UTC") {
+    # the origin argument will be better supported once we implement temporal
+    # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
+    # TODO revisit once the above has been sorted
+    if (call_binding("is.numeric", x) & origin != "1970-01-01") {
+      abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow")
     }
 
-    if (units != "secs") {
-      abort("`as.difftime()` with units other than 'secs' not supported in Arrow")
+    # assume format is ISO if unspecified (to align with lubridate::as_date)
+    if (is.null(format)) {
+      format <- "%Y-%m-%d"
     }
 
-    if (call_binding("is.character", x)) {
-      x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
-      # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us)
-      # and then we cast to duration ("s") at the end
-      x <- x$cast(time64("us"))$cast(int64())$cast(duration("us"))
-    }
+    if (call_binding("is.Date", x)) {
+      return(x)
 
-    # numeric -> duration not supported in Arrow yet so we use int64() as an
-    # intermediate step
-    # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results
-    # in numeric -> duration support
+      # cast from POSIXct
+    } else if (call_binding("is.POSIXct", x)) {
+      # this is where as_date() differs from as.Date()
+      if (!missing(tz)) {
+        x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz)))
+      }
+      # POSIXct is of type double -> we need this to prevent going down the
+      # "double" branch
+      x <- x
+
+      # cast from character
+    } else if (call_binding("is.character", x)) {
+      # unit = 0L is the identifier for seconds in valid_time32_units
+      x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
 
+      # cast from numeric
+    } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) {
+      # Arrow does not support direct casting from double to date32(), but for
+      # integer-like values we can go via int32()
+      # https://issues.apache.org/jira/browse/ARROW-15798
+      # TODO revisit if arrow decides to support double -> date casting
+      x <- build_expr("cast", x, options = cast_options(to_type = int32()))
+    }
+    build_expr("cast", x, options = cast_options(to_type = date32()))
+  })

Review comment:
       It looks like a lot of this functionality is duplicated in `as.Date` up above, could we factor those out into an abstract function that does both?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org