You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ro...@apache.org on 2019/06/12 11:51:55 UTC

[arrow] branch master updated: ARROW-5504 [R]: move use_threads argument to global option

This is an automated email from the ASF dual-hosted git repository.

romainfrancois pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f6e3c43  ARROW-5504 [R]: move use_threads argument to global option
f6e3c43 is described below

commit f6e3c437f0b330fb8ffbc6959838bc1fe3bf24f4
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Wed Jun 12 13:51:44 2019 +0200

    ARROW-5504 [R]: move use_threads argument to global option
    
    At this point the `option_use_threads()` function is not exported or documented, and threads are always used by default; users would need to set `options(arrow.use_threads = FALSE)` to turn them off.
    
    Author: Romain Francois <ro...@rstudio.com>
    
    Closes #4515 from romainfrancois/ARROW-5504/use_threads and squashes the following commits:
    
    445364ab <Romain Francois> s/as_tibble()/as.data.frame()/
    4afa7d4d <Romain Francois> + option_use_threads() function
---
 r/DESCRIPTION                   |  2 +-
 r/NAMESPACE                     |  1 +
 r/R/ChunkedArray.R              |  1 -
 r/R/R6.R                        |  6 ------
 r/R/RecordBatch.R               |  2 +-
 r/R/Table.R                     |  2 +-
 r/R/array.R                     |  1 -
 r/R/{zzz.R => arrow-package.R}  | 13 +++++++++++--
 r/R/csv.R                       |  6 ++----
 r/R/feather.R                   |  5 ++---
 r/R/json.R                      |  2 +-
 r/R/parquet.R                   |  6 ++----
 r/R/read_table.R                |  2 --
 r/README.Rmd                    |  2 +-
 r/README.md                     | 22 +++++++++++-----------
 r/man/arrow-package.Rd          | 40 ++++++++++++++++++++++++++++++++++++++++
 r/man/csv_read_options.Rd       |  4 +---
 r/man/read_feather.Rd           |  5 +----
 r/man/read_parquet.Rd           |  5 +----
 r/man/read_table.Rd             |  4 +---
 r/tests/testthat/test-json.R    |  4 ++--
 r/tests/testthat/test-parquet.R |  7 -------
 22 files changed, 80 insertions(+), 62 deletions(-)

diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index c38e5a1..f38f0de 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -60,6 +60,7 @@ Collate:
     'Table.R'
     'array.R'
     'arrowExports.R'
+    'arrow-package.R'
     'buffer.R'
     'io.R'
     'compression.R'
@@ -75,4 +76,3 @@ Collate:
     'read_table.R'
     'reexports-bit64.R'
     'write_arrow.R'
-    'zzz.R'
diff --git a/r/NAMESPACE b/r/NAMESPACE
index d535ea9..3f91568 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -180,6 +180,7 @@ importFrom(purrr,map_int)
 importFrom(rlang,"%||%")
 importFrom(rlang,abort)
 importFrom(rlang,dots_n)
+importFrom(rlang,is_false)
 importFrom(rlang,list2)
 importFrom(rlang,warn)
 useDynLib(arrow, .registration = TRUE)
diff --git a/r/R/ChunkedArray.R b/r/R/ChunkedArray.R
index 339a416..69a0224 100644
--- a/r/R/ChunkedArray.R
+++ b/r/R/ChunkedArray.R
@@ -60,7 +60,6 @@
 #' @param \dots Vectors to coerce
 #' @param type currently ignored
 #'
-#' @importFrom rlang list2 %||%
 #' @export
 chunked_array <- function(..., type = NULL){
   shared_ptr(`arrow::ChunkedArray`, ChunkedArray__from_list(list2(...), type))
diff --git a/r/R/R6.R b/r/R/R6.R
index 26c679f..e343116 100644
--- a/r/R/R6.R
+++ b/r/R/R6.R
@@ -16,12 +16,6 @@
 # under the License.
 
 #' @include enums.R
-#' @importFrom R6 R6Class
-#' @importFrom glue glue
-#' @importFrom purrr map map_int map2
-#' @importFrom rlang dots_n
-#' @importFrom assertthat assert_that
-
 `arrow::Object` <- R6Class("arrow::Object",
   public = list(
     initialize = function(xp) self$set_pointer(xp),
diff --git a/r/R/RecordBatch.R b/r/R/RecordBatch.R
index 3ebd81b..d60c823 100644
--- a/r/R/RecordBatch.R
+++ b/r/R/RecordBatch.R
@@ -86,7 +86,7 @@
 
 #' @export
 `as.data.frame.arrow::RecordBatch` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){
-  RecordBatch__to_dataframe(x, use_threads = use_threads)
+  RecordBatch__to_dataframe(x, use_threads = option_use_threads())
 }
 
 #' Create an [arrow::RecordBatch][arrow__RecordBatch] from a data frame
diff --git a/r/R/Table.R b/r/R/Table.R
index 4c434b0..6d50394 100644
--- a/r/R/Table.R
+++ b/r/R/Table.R
@@ -67,7 +67,7 @@ table <- function(..., schema = NULL){
 
 #' @export
 `as.data.frame.arrow::Table` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){
-  Table__to_dataframe(x, use_threads = use_threads)
+  Table__to_dataframe(x, use_threads = option_use_threads())
 }
 
 #' @export
diff --git a/r/R/array.R b/r/R/array.R
index ccb8521..244cee0 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -122,7 +122,6 @@
 #' @param x R object
 #' @param type Explicit [type][arrow__DataType], or NULL (the default) to infer from the data
 #'
-#' @importFrom rlang warn
 #' @export
 array <- function(x, type = NULL){
   `arrow::Array`$dispatch(Array__from_vector(x, type))
diff --git a/r/R/zzz.R b/r/R/arrow-package.R
similarity index 76%
rename from r/R/zzz.R
rename to r/R/arrow-package.R
index eab9ad4..41cbc2a 100644
--- a/r/R/zzz.R
+++ b/r/R/arrow-package.R
@@ -15,9 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 
-#' @useDynLib arrow, .registration = TRUE
+#' @importFrom R6 R6Class
+#' @importFrom glue glue
+#' @importFrom purrr map map_int map2
+#' @importFrom assertthat assert_that
+#' @importFrom rlang list2 %||% is_false abort dots_n warn
 #' @importFrom Rcpp sourceCpp
-NULL
+#' @useDynLib arrow, .registration = TRUE
+#' @keywords internal
+"_PACKAGE"
 
 #' Is the C++ Arrow library available
 #'
@@ -26,3 +32,6 @@ arrow_available <- function() {
   .Call(`_arrow_available`)
 }
 
+option_use_threads <- function() {
+  !is_false(getOption("arrow.use_threads"))
+}
diff --git a/r/R/csv.R b/r/R/csv.R
index 5199279..d34ddcb 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -29,14 +29,13 @@
 
 #' read options for the csv reader
 #'
-#' @param use_threads Whether to use the global CPU thread pool
 #' @param block_size Block size we request from the IO layer; also determines the size of chunks when use_threads is `TRUE`
 #'
 #' @export
-csv_read_options <- function(use_threads = TRUE, block_size = 1048576L) {
+csv_read_options <- function(block_size = 1048576L) {
   shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize(
     list(
-      use_threads = use_threads,
+      use_threads = option_use_threads(),
       block_size = block_size
     )
   ))
@@ -107,7 +106,6 @@ csv_table_reader <- function(file,
   UseMethod("csv_table_reader")
 }
 
-#' @importFrom rlang abort
 #' @export
 csv_table_reader.default <- function(file,
   read_options = csv_read_options(),
diff --git a/r/R/feather.R b/r/R/feather.R
index c65ea9e..998f39b 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -160,16 +160,15 @@ FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) {
 #' @param file a arrow::ipc::feather::TableReader or whatever the [FeatherTableReader()] function can handle
 #' @param columns names if the columns to read. The default `NULL` means all columns
 #' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble.
-#' @param use_threads Use threads when converting to a tibble.
 #' @param ... additional parameters
 #'
 #' @return a data frame if `as_tibble` is `TRUE` (the default), or a [arrow::Table][arrow__Table] otherwise
 #'
 #' @export
-read_feather <- function(file, columns = NULL, as_tibble = TRUE, use_threads = TRUE, ...){
+read_feather <- function(file, columns = NULL, as_tibble = TRUE, ...){
   out <- FeatherTableReader(file, ...)$Read(columns)
   if (isTRUE(as_tibble)) {
-    out <- as.data.frame(out, use_threads = use_threads)
+    out <- as.data.frame(out)
   }
   out
 }
diff --git a/r/R/json.R b/r/R/json.R
index 2de8b94..e51051d 100644
--- a/r/R/json.R
+++ b/r/R/json.R
@@ -153,7 +153,7 @@ json_table_reader.default <- function(file,
 read_json_arrow <- function(..., as_tibble = TRUE) {
   tab <- json_table_reader(...)$Read()
   if (isTRUE(as_tibble)) {
-    tab <- as_tibble(tab)
+    tab <- as.data.frame(tab)
   }
   tab
 }
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 8caf356..0efed1f 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -23,8 +23,6 @@
 #' @param file a file path
 #' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a
 #' tibble? Default is `TRUE`.
-#' @param use_threads Use threads when converting to a tibble? Default is
-#' '`TRUE`. Only relevant if `as_tibble` is `TRUE`.
 #' @param ... Additional arguments, currently ignored
 #'
 #' @return A [arrow::Table][arrow__Table], or a `tbl_df` if `as_tibble` is
@@ -36,10 +34,10 @@
 #' }
 #'
 #' @export
-read_parquet <- function(file, as_tibble = TRUE, use_threads = TRUE, ...) {
+read_parquet <- function(file, as_tibble = TRUE, ...) {
   tab <- shared_ptr(`arrow::Table`, read_parquet_file(file))
   if (isTRUE(as_tibble)) {
-    tab <- as.data.frame(tab, use_threads = use_threads)
+    tab <- as.data.frame(tab)
   }
   tab
 }
diff --git a/r/R/read_table.R b/r/R/read_table.R
index f7a7987..d5122a8 100644
--- a/r/R/read_table.R
+++ b/r/R/read_table.R
@@ -33,8 +33,6 @@
 #'
 #'  - a raw vector: read using a [arrow::ipc::RecordBatchStreamReader][arrow__ipc__RecordBatchStreamReader]
 #'
-#' @param use_threads Use threads when converting to a tibble
-#'
 #' @return
 #'
 #'  - `read_table` returns an [arrow::Table][arrow__Table]
diff --git a/r/README.Rmd b/r/README.Rmd
index 4b6ab14..f718732 100644
--- a/r/README.Rmd
+++ b/r/README.Rmd
@@ -91,7 +91,7 @@ library(arrow)
 tab <- arrow::table(x = 1:10, y = rnorm(10))
 tab$schema
 tab
-as_tibble(tab)
+as.data.frame(tab)
 ```
 
 ## Developing
diff --git a/r/README.md b/r/README.md
index c87b8c6..b584486 100644
--- a/r/README.md
+++ b/r/README.md
@@ -104,20 +104,20 @@ tab$schema
 #> y: double
 tab
 #> arrow::Table
-as_tibble(tab)
+as.data.frame(tab)
 #> # A tibble: 10 x 2
 #>        x      y
 #>    <int>  <dbl>
-#>  1     1  0.524
-#>  2     2 -0.606
-#>  3     3 -0.655
-#>  4     4  1.37 
-#>  5     5  1.53 
-#>  6     6  1.96 
-#>  7     7  1.80 
-#>  8     8  1.27 
-#>  9     9  0.698
-#> 10    10 -0.661
+#>  1     1 -1.56 
+#>  2     2 -0.147
+#>  3     3 -1.16 
+#>  4     4  0.106
+#>  5     5  1.14 
+#>  6     6  0.340
+#>  7     7  0.184
+#>  8     8 -1.01 
+#>  9     9  1.77 
+#> 10    10  0.344
 ```
 
 ## Developing
diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd
new file mode 100644
index 0000000..e2cbbbe
--- /dev/null
+++ b/r/man/arrow-package.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/arrow-package.R
+\docType{package}
+\name{arrow-package}
+\alias{arrow}
+\alias{arrow-package}
+\title{arrow: Integration to 'Apache' 'Arrow'}
+\description{
+'Apache' 'Arrow' <https://arrow.apache.org/> is a cross-language
+    development platform for in-memory data. It specifies a standardized
+    language-independent columnar memory format for flat and hierarchical data,
+    organized for efficient analytic operations on modern hardware. This
+    package provides an interface to the Arrow C++ library.
+}
+\seealso{
+Useful links:
+\itemize{
+  \item \url{https://arrow.apache.org/docs/r/}
+  \item \url{https://github.com/apache/arrow/}
+  \item Report bugs at \url{https://issues.apache.org/jira/projects/ARROW/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Romain François \email{romain@rstudio.com}
+
+Authors:
+\itemize{
+  \item Jeroen Ooms \email{jeroen@berkeley.edu}
+  \item Apache Arrow \email{dev@arrow.apache.org} [copyright holder]
+}
+
+Other contributors:
+\itemize{
+  \item Javier Luraschi \email{javier@rstudio.com} [contributor]
+  \item Jeffrey Wong \email{jeffreyw@netflix.com} [contributor]
+}
+
+}
+\keyword{internal}
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
index 3fa2d8c..952138a 100644
--- a/r/man/csv_read_options.Rd
+++ b/r/man/csv_read_options.Rd
@@ -4,11 +4,9 @@
 \alias{csv_read_options}
 \title{read options for the csv reader}
 \usage{
-csv_read_options(use_threads = TRUE, block_size = 1048576L)
+csv_read_options(block_size = 1048576L)
 }
 \arguments{
-\item{use_threads}{Whether to use the global CPU thread pool}
-
 \item{block_size}{Block size we request from the IO layer; also determines the size of chunks when use_threads is \code{TRUE}}
 }
 \description{
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index 4509c7d..31fd36a 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -4,8 +4,7 @@
 \alias{read_feather}
 \title{Read a feather file}
 \usage{
-read_feather(file, columns = NULL, as_tibble = TRUE,
-  use_threads = TRUE, ...)
+read_feather(file, columns = NULL, as_tibble = TRUE, ...)
 }
 \arguments{
 \item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=FeatherTableReader]{FeatherTableReader()}} function can handle}
@@ -14,8 +13,6 @@ read_feather(file, columns = NULL, as_tibble = TRUE,
 
 \item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.}
 
-\item{use_threads}{Use threads when converting to a tibble.}
-
 \item{...}{additional parameters}
 }
 \value{
diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd
index 2bce02c..3b1973b 100644
--- a/r/man/read_parquet.Rd
+++ b/r/man/read_parquet.Rd
@@ -4,7 +4,7 @@
 \alias{read_parquet}
 \title{Read Parquet file from disk}
 \usage{
-read_parquet(file, as_tibble = TRUE, use_threads = TRUE, ...)
+read_parquet(file, as_tibble = TRUE, ...)
 }
 \arguments{
 \item{file}{a file path}
@@ -12,9 +12,6 @@ read_parquet(file, as_tibble = TRUE, use_threads = TRUE, ...)
 \item{as_tibble}{Should the \link[=arrow__Table]{arrow::Table} be converted to a
 tibble? Default is \code{TRUE}.}
 
-\item{use_threads}{Use threads when converting to a tibble? Default is
-'\code{TRUE}. Only relevant if \code{as_tibble} is \code{TRUE}.}
-
 \item{...}{Additional arguments, currently ignored}
 }
 \value{
diff --git a/r/man/read_table.Rd b/r/man/read_table.Rd
index 356ec5e..3231b26 100644
--- a/r/man/read_table.Rd
+++ b/r/man/read_table.Rd
@@ -7,7 +7,7 @@
 \usage{
 read_table(stream)
 
-read_arrow(stream, use_threads = TRUE)
+read_arrow(stream)
 }
 \arguments{
 \item{stream}{stream.
@@ -23,8 +23,6 @@ binary file format, and uses a \link[=arrow__ipc__RecordBatchFileReader]{arrow::
 to process it.
 \item a raw vector: read using a \link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader}
 }}
-
-\item{use_threads}{Use threads when converting to a tibble}
 }
 \value{
 \itemize{
diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R
index 627e445..0321fb4 100644
--- a/r/tests/testthat/test-json.R
+++ b/r/tests/testthat/test-json.R
@@ -37,7 +37,7 @@ test_that("Can read json file with scalars columns (ARROW-5503)", {
     tab1$schema,
     schema(hello = float64(), world = boolean(), yo = utf8())
   )
-  tib <- as_tibble(tab1)
+  tib <- as.data.frame(tab1)
   expect_equal(tib$hello, c(3.5, 3.25, 3.125, 0))
   expect_equal(tib$world, c(FALSE, NA, NA, TRUE))
   expect_equal(tib$yo, c("thing", NA, "\u5fcd", NA))
@@ -99,7 +99,7 @@ test_that("Can read json file with nested columns (ARROW-5503)", {
     )
   )
   # cannot yet test list and struct types in R api
-  # tib <- as_tibble(tab1)
+  # tib <- as.data.frame(tab1)
 
   unlink(tf)
 })
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index f0a742d..5ad573c 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -25,10 +25,3 @@ test_that("reading a known Parquet file to tibble", {
   expect_identical(dim(df), c(10L, 11L))
   # TODO: assert more about the contents
 })
-
-test_that("as.data.frame with and without threads", {
-  expect_identical(
-    read_parquet(pq_file),
-    read_parquet(pq_file, use_threads = FALSE)
-  )
-})