You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ro...@apache.org on 2019/06/12 11:51:55 UTC
[arrow] branch master updated: ARROW-5504 [R]: move use_threads argument to global option
This is an automated email from the ASF dual-hosted git repository.
romainfrancois pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f6e3c43 ARROW-5504 [R]: move use_threads argument to global option
f6e3c43 is described below
commit f6e3c437f0b330fb8ffbc6959838bc1fe3bf24f4
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Wed Jun 12 13:51:44 2019 +0200
ARROW-5504 [R]: move use_threads argument to global option
At this point the `option_use_threads()` function is neither exported nor documented, and threads are always used by default; users would need to set `options(arrow.use_threads = FALSE)` to turn them off.
Author: Romain Francois <ro...@rstudio.com>
Closes #4515 from romainfrancois/ARROW-5504/use_threads and squashes the following commits:
445364ab <Romain Francois> s/as_tibble()/as.data.frame()/
4afa7d4d <Romain Francois> + option_use_threads() function
---
r/DESCRIPTION | 2 +-
r/NAMESPACE | 1 +
r/R/ChunkedArray.R | 1 -
r/R/R6.R | 6 ------
r/R/RecordBatch.R | 2 +-
r/R/Table.R | 2 +-
r/R/array.R | 1 -
r/R/{zzz.R => arrow-package.R} | 13 +++++++++++--
r/R/csv.R | 6 ++----
r/R/feather.R | 5 ++---
r/R/json.R | 2 +-
r/R/parquet.R | 6 ++----
r/R/read_table.R | 2 --
r/README.Rmd | 2 +-
r/README.md | 22 +++++++++++-----------
r/man/arrow-package.Rd | 40 ++++++++++++++++++++++++++++++++++++++++
r/man/csv_read_options.Rd | 4 +---
r/man/read_feather.Rd | 5 +----
r/man/read_parquet.Rd | 5 +----
r/man/read_table.Rd | 4 +---
r/tests/testthat/test-json.R | 4 ++--
r/tests/testthat/test-parquet.R | 7 -------
22 files changed, 80 insertions(+), 62 deletions(-)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index c38e5a1..f38f0de 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -60,6 +60,7 @@ Collate:
'Table.R'
'array.R'
'arrowExports.R'
+ 'arrow-package.R'
'buffer.R'
'io.R'
'compression.R'
@@ -75,4 +76,3 @@ Collate:
'read_table.R'
'reexports-bit64.R'
'write_arrow.R'
- 'zzz.R'
diff --git a/r/NAMESPACE b/r/NAMESPACE
index d535ea9..3f91568 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -180,6 +180,7 @@ importFrom(purrr,map_int)
importFrom(rlang,"%||%")
importFrom(rlang,abort)
importFrom(rlang,dots_n)
+importFrom(rlang,is_false)
importFrom(rlang,list2)
importFrom(rlang,warn)
useDynLib(arrow, .registration = TRUE)
diff --git a/r/R/ChunkedArray.R b/r/R/ChunkedArray.R
index 339a416..69a0224 100644
--- a/r/R/ChunkedArray.R
+++ b/r/R/ChunkedArray.R
@@ -60,7 +60,6 @@
#' @param \dots Vectors to coerce
#' @param type currently ignored
#'
-#' @importFrom rlang list2 %||%
#' @export
chunked_array <- function(..., type = NULL){
shared_ptr(`arrow::ChunkedArray`, ChunkedArray__from_list(list2(...), type))
diff --git a/r/R/R6.R b/r/R/R6.R
index 26c679f..e343116 100644
--- a/r/R/R6.R
+++ b/r/R/R6.R
@@ -16,12 +16,6 @@
# under the License.
#' @include enums.R
-#' @importFrom R6 R6Class
-#' @importFrom glue glue
-#' @importFrom purrr map map_int map2
-#' @importFrom rlang dots_n
-#' @importFrom assertthat assert_that
-
`arrow::Object` <- R6Class("arrow::Object",
public = list(
initialize = function(xp) self$set_pointer(xp),
diff --git a/r/R/RecordBatch.R b/r/R/RecordBatch.R
index 3ebd81b..d60c823 100644
--- a/r/R/RecordBatch.R
+++ b/r/R/RecordBatch.R
@@ -86,7 +86,7 @@
#' @export
`as.data.frame.arrow::RecordBatch` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){
- RecordBatch__to_dataframe(x, use_threads = use_threads)
+ RecordBatch__to_dataframe(x, use_threads = option_use_threads())
}
#' Create an [arrow::RecordBatch][arrow__RecordBatch] from a data frame
diff --git a/r/R/Table.R b/r/R/Table.R
index 4c434b0..6d50394 100644
--- a/r/R/Table.R
+++ b/r/R/Table.R
@@ -67,7 +67,7 @@ table <- function(..., schema = NULL){
#' @export
`as.data.frame.arrow::Table` <- function(x, row.names = NULL, optional = FALSE, use_threads = TRUE, ...){
- Table__to_dataframe(x, use_threads = use_threads)
+ Table__to_dataframe(x, use_threads = option_use_threads())
}
#' @export
diff --git a/r/R/array.R b/r/R/array.R
index ccb8521..244cee0 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -122,7 +122,6 @@
#' @param x R object
#' @param type Explicit [type][arrow__DataType], or NULL (the default) to infer from the data
#'
-#' @importFrom rlang warn
#' @export
array <- function(x, type = NULL){
`arrow::Array`$dispatch(Array__from_vector(x, type))
diff --git a/r/R/zzz.R b/r/R/arrow-package.R
similarity index 76%
rename from r/R/zzz.R
rename to r/R/arrow-package.R
index eab9ad4..41cbc2a 100644
--- a/r/R/zzz.R
+++ b/r/R/arrow-package.R
@@ -15,9 +15,15 @@
# specific language governing permissions and limitations
# under the License.
-#' @useDynLib arrow, .registration = TRUE
+#' @importFrom R6 R6Class
+#' @importFrom glue glue
+#' @importFrom purrr map map_int map2
+#' @importFrom assertthat assert_that
+#' @importFrom rlang list2 %||% is_false abort dots_n warn
#' @importFrom Rcpp sourceCpp
-NULL
+#' @useDynLib arrow, .registration = TRUE
+#' @keywords internal
+"_PACKAGE"
#' Is the C++ Arrow library available
#'
@@ -26,3 +32,6 @@ arrow_available <- function() {
.Call(`_arrow_available`)
}
+option_use_threads <- function() {
+ !is_false(getOption("arrow.use_threads"))
+}
diff --git a/r/R/csv.R b/r/R/csv.R
index 5199279..d34ddcb 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -29,14 +29,13 @@
#' read options for the csv reader
#'
-#' @param use_threads Whether to use the global CPU thread pool
#' @param block_size Block size we request from the IO layer; also determines the size of chunks when use_threads is `TRUE`
#'
#' @export
-csv_read_options <- function(use_threads = TRUE, block_size = 1048576L) {
+csv_read_options <- function(block_size = 1048576L) {
shared_ptr(`arrow::csv::ReadOptions`, csv___ReadOptions__initialize(
list(
- use_threads = use_threads,
+ use_threads = option_use_threads(),
block_size = block_size
)
))
@@ -107,7 +106,6 @@ csv_table_reader <- function(file,
UseMethod("csv_table_reader")
}
-#' @importFrom rlang abort
#' @export
csv_table_reader.default <- function(file,
read_options = csv_read_options(),
diff --git a/r/R/feather.R b/r/R/feather.R
index c65ea9e..998f39b 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -160,16 +160,15 @@ FeatherTableReader.fs_path <- function(file, mmap = TRUE, ...) {
#' @param file a arrow::ipc::feather::TableReader or whatever the [FeatherTableReader()] function can handle
#' @param columns names if the columns to read. The default `NULL` means all columns
#' @param as_tibble should the [arrow::Table][arrow__Table] be converted to a tibble.
-#' @param use_threads Use threads when converting to a tibble.
#' @param ... additional parameters
#'
#' @return a data frame if `as_tibble` is `TRUE` (the default), or a [arrow::Table][arrow__Table] otherwise
#'
#' @export
-read_feather <- function(file, columns = NULL, as_tibble = TRUE, use_threads = TRUE, ...){
+read_feather <- function(file, columns = NULL, as_tibble = TRUE, ...){
out <- FeatherTableReader(file, ...)$Read(columns)
if (isTRUE(as_tibble)) {
- out <- as.data.frame(out, use_threads = use_threads)
+ out <- as.data.frame(out)
}
out
}
diff --git a/r/R/json.R b/r/R/json.R
index 2de8b94..e51051d 100644
--- a/r/R/json.R
+++ b/r/R/json.R
@@ -153,7 +153,7 @@ json_table_reader.default <- function(file,
read_json_arrow <- function(..., as_tibble = TRUE) {
tab <- json_table_reader(...)$Read()
if (isTRUE(as_tibble)) {
- tab <- as_tibble(tab)
+ tab <- as.data.frame(tab)
}
tab
}
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 8caf356..0efed1f 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -23,8 +23,6 @@
#' @param file a file path
#' @param as_tibble Should the [arrow::Table][arrow__Table] be converted to a
#' tibble? Default is `TRUE`.
-#' @param use_threads Use threads when converting to a tibble? Default is
-#' '`TRUE`. Only relevant if `as_tibble` is `TRUE`.
#' @param ... Additional arguments, currently ignored
#'
#' @return A [arrow::Table][arrow__Table], or a `tbl_df` if `as_tibble` is
@@ -36,10 +34,10 @@
#' }
#'
#' @export
-read_parquet <- function(file, as_tibble = TRUE, use_threads = TRUE, ...) {
+read_parquet <- function(file, as_tibble = TRUE, ...) {
tab <- shared_ptr(`arrow::Table`, read_parquet_file(file))
if (isTRUE(as_tibble)) {
- tab <- as.data.frame(tab, use_threads = use_threads)
+ tab <- as.data.frame(tab)
}
tab
}
diff --git a/r/R/read_table.R b/r/R/read_table.R
index f7a7987..d5122a8 100644
--- a/r/R/read_table.R
+++ b/r/R/read_table.R
@@ -33,8 +33,6 @@
#'
#' - a raw vector: read using a [arrow::ipc::RecordBatchStreamReader][arrow__ipc__RecordBatchStreamReader]
#'
-#' @param use_threads Use threads when converting to a tibble
-#'
#' @return
#'
#' - `read_table` returns an [arrow::Table][arrow__Table]
diff --git a/r/README.Rmd b/r/README.Rmd
index 4b6ab14..f718732 100644
--- a/r/README.Rmd
+++ b/r/README.Rmd
@@ -91,7 +91,7 @@ library(arrow)
tab <- arrow::table(x = 1:10, y = rnorm(10))
tab$schema
tab
-as_tibble(tab)
+as.data.frame(tab)
```
## Developing
diff --git a/r/README.md b/r/README.md
index c87b8c6..b584486 100644
--- a/r/README.md
+++ b/r/README.md
@@ -104,20 +104,20 @@ tab$schema
#> y: double
tab
#> arrow::Table
-as_tibble(tab)
+as.data.frame(tab)
#> # A tibble: 10 x 2
#> x y
#> <int> <dbl>
-#> 1 1 0.524
-#> 2 2 -0.606
-#> 3 3 -0.655
-#> 4 4 1.37
-#> 5 5 1.53
-#> 6 6 1.96
-#> 7 7 1.80
-#> 8 8 1.27
-#> 9 9 0.698
-#> 10 10 -0.661
+#> 1 1 -1.56
+#> 2 2 -0.147
+#> 3 3 -1.16
+#> 4 4 0.106
+#> 5 5 1.14
+#> 6 6 0.340
+#> 7 7 0.184
+#> 8 8 -1.01
+#> 9 9 1.77
+#> 10 10 0.344
```
## Developing
diff --git a/r/man/arrow-package.Rd b/r/man/arrow-package.Rd
new file mode 100644
index 0000000..e2cbbbe
--- /dev/null
+++ b/r/man/arrow-package.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/arrow-package.R
+\docType{package}
+\name{arrow-package}
+\alias{arrow}
+\alias{arrow-package}
+\title{arrow: Integration to 'Apache' 'Arrow'}
+\description{
+'Apache' 'Arrow' <https://arrow.apache.org/> is a cross-language
+ development platform for in-memory data. It specifies a standardized
+ language-independent columnar memory format for flat and hierarchical data,
+ organized for efficient analytic operations on modern hardware. This
+ package provides an interface to the Arrow C++ library.
+}
+\seealso{
+Useful links:
+\itemize{
+ \item \url{https://arrow.apache.org/docs/r/}
+ \item \url{https://github.com/apache/arrow/}
+ \item Report bugs at \url{https://issues.apache.org/jira/projects/ARROW/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Romain François \email{romain@rstudio.com}
+
+Authors:
+\itemize{
+ \item Jeroen Ooms \email{jeroen@berkeley.edu}
+ \item Apache Arrow \email{dev@arrow.apache.org} [copyright holder]
+}
+
+Other contributors:
+\itemize{
+ \item Javier Luraschi \email{javier@rstudio.com} [contributor]
+ \item Jeffrey Wong \email{jeffreyw@netflix.com} [contributor]
+}
+
+}
+\keyword{internal}
diff --git a/r/man/csv_read_options.Rd b/r/man/csv_read_options.Rd
index 3fa2d8c..952138a 100644
--- a/r/man/csv_read_options.Rd
+++ b/r/man/csv_read_options.Rd
@@ -4,11 +4,9 @@
\alias{csv_read_options}
\title{read options for the csv reader}
\usage{
-csv_read_options(use_threads = TRUE, block_size = 1048576L)
+csv_read_options(block_size = 1048576L)
}
\arguments{
-\item{use_threads}{Whether to use the global CPU thread pool}
-
\item{block_size}{Block size we request from the IO layer; also determines the size of chunks when use_threads is \code{TRUE}}
}
\description{
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index 4509c7d..31fd36a 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -4,8 +4,7 @@
\alias{read_feather}
\title{Read a feather file}
\usage{
-read_feather(file, columns = NULL, as_tibble = TRUE,
- use_threads = TRUE, ...)
+read_feather(file, columns = NULL, as_tibble = TRUE, ...)
}
\arguments{
\item{file}{a arrow::ipc::feather::TableReader or whatever the \code{\link[=FeatherTableReader]{FeatherTableReader()}} function can handle}
@@ -14,8 +13,6 @@ read_feather(file, columns = NULL, as_tibble = TRUE,
\item{as_tibble}{should the \link[=arrow__Table]{arrow::Table} be converted to a tibble.}
-\item{use_threads}{Use threads when converting to a tibble.}
-
\item{...}{additional parameters}
}
\value{
diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd
index 2bce02c..3b1973b 100644
--- a/r/man/read_parquet.Rd
+++ b/r/man/read_parquet.Rd
@@ -4,7 +4,7 @@
\alias{read_parquet}
\title{Read Parquet file from disk}
\usage{
-read_parquet(file, as_tibble = TRUE, use_threads = TRUE, ...)
+read_parquet(file, as_tibble = TRUE, ...)
}
\arguments{
\item{file}{a file path}
@@ -12,9 +12,6 @@ read_parquet(file, as_tibble = TRUE, use_threads = TRUE, ...)
\item{as_tibble}{Should the \link[=arrow__Table]{arrow::Table} be converted to a
tibble? Default is \code{TRUE}.}
-\item{use_threads}{Use threads when converting to a tibble? Default is
-'\code{TRUE}. Only relevant if \code{as_tibble} is \code{TRUE}.}
-
\item{...}{Additional arguments, currently ignored}
}
\value{
diff --git a/r/man/read_table.Rd b/r/man/read_table.Rd
index 356ec5e..3231b26 100644
--- a/r/man/read_table.Rd
+++ b/r/man/read_table.Rd
@@ -7,7 +7,7 @@
\usage{
read_table(stream)
-read_arrow(stream, use_threads = TRUE)
+read_arrow(stream)
}
\arguments{
\item{stream}{stream.
@@ -23,8 +23,6 @@ binary file format, and uses a \link[=arrow__ipc__RecordBatchFileReader]{arrow::
to process it.
\item a raw vector: read using a \link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader}
}}
-
-\item{use_threads}{Use threads when converting to a tibble}
}
\value{
\itemize{
diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R
index 627e445..0321fb4 100644
--- a/r/tests/testthat/test-json.R
+++ b/r/tests/testthat/test-json.R
@@ -37,7 +37,7 @@ test_that("Can read json file with scalars columns (ARROW-5503)", {
tab1$schema,
schema(hello = float64(), world = boolean(), yo = utf8())
)
- tib <- as_tibble(tab1)
+ tib <- as.data.frame(tab1)
expect_equal(tib$hello, c(3.5, 3.25, 3.125, 0))
expect_equal(tib$world, c(FALSE, NA, NA, TRUE))
expect_equal(tib$yo, c("thing", NA, "\u5fcd", NA))
@@ -99,7 +99,7 @@ test_that("Can read json file with nested columns (ARROW-5503)", {
)
)
# cannot yet test list and struct types in R api
- # tib <- as_tibble(tab1)
+ # tib <- as.data.frame(tab1)
unlink(tf)
})
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index f0a742d..5ad573c 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -25,10 +25,3 @@ test_that("reading a known Parquet file to tibble", {
expect_identical(dim(df), c(10L, 11L))
# TODO: assert more about the contents
})
-
-test_that("as.data.frame with and without threads", {
- expect_identical(
- read_parquet(pq_file),
- read_parquet(pq_file, use_threads = FALSE)
- )
-})