You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by np...@apache.org on 2022/10/14 01:00:20 UTC

[arrow] branch master updated: ARROW-15602: [R][Docs] Update docs to explain how to read timestamp with timezone columns (#13877)

This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7ef4b4a0ae ARROW-15602: [R][Docs] Update docs to explain how to read timestamp with timezone columns (#13877)
7ef4b4a0ae is described below

commit 7ef4b4a0ae0c6c15a45ec439e348e26e1e80523d
Author: eitsupi <50...@users.noreply.github.com>
AuthorDate: Fri Oct 14 10:00:10 2022 +0900

    ARROW-15602: [R][Docs] Update docs to explain how to read timestamp with timezone columns (#13877)
    
    If users expect `read_csv_arrow` to behave the same as `readr::read_csv`, they will be confused by the presence or absence of a time zone, so a note is added to the examples.
    Also adds the same example to the tests, along with a test verifying that an error occurs when a timestamp with a time zone is read with `col_types = "T"`.
    
    Also update the type description to link to the Arrow type documentation.
    
    Authored-by: SHIMA Tatsuya <ts...@gmail.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
---
 r/R/csv.R                   | 33 ++++++++++++++++++++++-----------
 r/man/read_delim_arrow.Rd   | 33 ++++++++++++++++++++++-----------
 r/tests/testthat/test-csv.R | 19 +++++++++++++++++--
 3 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/r/R/csv.R b/r/R/csv.R
index 71e01971f4..7b474c137e 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -54,17 +54,17 @@
 #' single string, one character per column, where the characters map to Arrow
 #' types analogously to the `readr` type mapping:
 #'
-#' * "c": `utf8()`
-#' * "i": `int32()`
-#' * "n": `float64()`
-#' * "d": `float64()`
-#' * "l": `bool()`
-#' * "f": `dictionary()`
-#' * "D": `date32()`
-#' * "T": `timestamp(unit = "ns")`
-#' * "t": `time32()` (The `unit` arg is set to the default value `"ms"`)
-#' * "_": `null()`
-#' * "-": `null()`
+#' * "c": [utf8()]
+#' * "i": [int32()]
+#' * "n": [float64()]
+#' * "d": [float64()]
+#' * "l": [bool()]
+#' * "f": [dictionary()]
+#' * "D": [date32()]
+#' * "T": [`timestamp(unit = "ns")`][timestamp()]
+#' * "t": [time32()] (The `unit` arg is set to the default value `"ms"`)
+#' * "_": [null()]
+#' * "-": [null()]
 #' * "?": infer the type from the data
 #'
 #' If you use the compact string representation for `col_types`, you must also
@@ -143,6 +143,17 @@
 #' read_csv_arrow(tf, schema = schema(x = int32(), y = utf8()), skip = 1)
 #' read_csv_arrow(tf, col_types = schema(y = utf8()))
 #' read_csv_arrow(tf, col_types = "ic", col_names = c("x", "y"), skip = 1)
+#'
+#' # Note that if a timestamp column contains time zones, type inference won't work,
+#' # whether automatic or via the string "T" `col_types` specification.
+#' # To parse timestamps with time zones, provide a [Schema] to `col_types`
+#' # and specify the time zone in the type object:
+#' tf <- tempfile()
+#' write.csv(data.frame(x = "1970-01-01T12:00:00+12:00"), file = tf, row.names = FALSE)
+#' read_csv_arrow(
+#'   tf,
+#'   col_types = schema(x = timestamp(unit = "us", timezone = "UTC"))
+#' )
 read_delim_arrow <- function(file,
                              delim = ",",
                              quote = '"',
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index f322c56c17..5b91fc0ec9 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -180,17 +180,17 @@ that \code{readr} uses to the \code{col_types} argument. This means you provide
 single string, one character per column, where the characters map to Arrow
 types analogously to the \code{readr} type mapping:
 \itemize{
-\item "c": \code{utf8()}
-\item "i": \code{int32()}
-\item "n": \code{float64()}
-\item "d": \code{float64()}
-\item "l": \code{bool()}
-\item "f": \code{dictionary()}
-\item "D": \code{date32()}
-\item "T": \code{timestamp(unit = "ns")}
-\item "t": \code{time32()} (The \code{unit} arg is set to the default value \code{"ms"})
-\item "_": \code{null()}
-\item "-": \code{null()}
+\item "c": \code{\link[=utf8]{utf8()}}
+\item "i": \code{\link[=int32]{int32()}}
+\item "n": \code{\link[=float64]{float64()}}
+\item "d": \code{\link[=float64]{float64()}}
+\item "l": \code{\link[=bool]{bool()}}
+\item "f": \code{\link[=dictionary]{dictionary()}}
+\item "D": \code{\link[=date32]{date32()}}
+\item "T": \code{\link[=timestamp]{timestamp(unit = "ns")}}
+\item "t": \code{\link[=time32]{time32()}} (The \code{unit} arg is set to the default value \code{"ms"})
+\item "_": \code{\link[=null]{null()}}
+\item "-": \code{\link[=null]{null()}}
 \item "?": infer the type from the data
 }
 
@@ -219,4 +219,15 @@ write.csv(data.frame(x = c(1, 3), y = c(2, 4)), file = tf, row.names = FALSE)
 read_csv_arrow(tf, schema = schema(x = int32(), y = utf8()), skip = 1)
 read_csv_arrow(tf, col_types = schema(y = utf8()))
 read_csv_arrow(tf, col_types = "ic", col_names = c("x", "y"), skip = 1)
+
+# Note that if a timestamp column contains time zones, type inference won't work,
+# whether automatic or via the string "T" `col_types` specification.
+# To parse timestamps with time zones, provide a [Schema] to `col_types`
+# and specify the time zone in the type object:
+tf <- tempfile()
+write.csv(data.frame(x = "1970-01-01T12:00:00+12:00"), file = tf, row.names = FALSE)
+read_csv_arrow(
+  tf,
+  col_types = schema(x = timestamp(unit = "us", timezone = "UTC"))
+)
 }
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
index cd8da2625c..6033253517 100644
--- a/r/tests/testthat/test-csv.R
+++ b/r/tests/testthat/test-csv.R
@@ -225,8 +225,11 @@ test_that("read_csv_arrow() can read timestamps", {
   # time zones are being read in as time zone-naive, hence ignore_attr = "tzone"
   expect_equal(tbl, df, ignore_attr = "tzone")
 
-  df <- read_csv_arrow(tf, col_types = "T", col_names = "time", skip = 1)
-  expect_equal(tbl, df, ignore_attr = "tzone")
+  # work with schema to specify timestamp with time zone type
+  tbl <- tibble::tibble(time = "1970-01-01T12:00:00+12:00")
+  write.csv(tbl, tf, row.names = FALSE)
+  df <- read_csv_arrow(tf, col_types = schema(time = timestamp(unit = "us", timezone = "UTC")))
+  expect_equal(df, tibble::tibble(time = as.POSIXct("1970-01-01 00:00:00", tz = "UTC")))
 })
 
 test_that("read_csv_arrow(timestamp_parsers=)", {
@@ -610,3 +613,15 @@ test_that("read_csv_arrow() can read sub-second timestamps with col_types T sett
   expected <- as.POSIXct(tbl$time, tz = "UTC")
   expect_equal(df$time, expected, ignore_attr = "tzone")
 })
+
+test_that("Shows an error message when trying to read a timestamp with time zone with col_types = T (ARROW-17429)", {
+  tbl <- tibble::tibble(time = c("1970-01-01T12:00:00+12:00"))
+  csv_file <- tempfile()
+  on.exit(unlink(csv_file))
+  write.csv(tbl, csv_file, row.names = FALSE)
+
+  expect_error(
+    read_csv_arrow(csv_file, col_types = "T", col_names = "time", skip = 1),
+    "CSV conversion error to timestamp\\[ns\\]: expected no zone offset in"
+  )
+})