You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by np...@apache.org on 2022/04/22 17:40:40 UTC
[arrow] branch master updated: ARROW-15989: [R] rbind & cbind for Table & RecordBatch
This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f23965ceed ARROW-15989: [R] rbind & cbind for Table & RecordBatch
f23965ceed is described below
commit f23965ceedb403f0f480a0892806f23ab81a2db0
Author: Will Jones <wi...@gmail.com>
AuthorDate: Fri Apr 22 13:40:34 2022 -0400
ARROW-15989: [R] rbind & cbind for Table & RecordBatch
## Summary of Changes
* Added `rbind` and `cbind` for Table
* Added `cbind` for RecordBatch. `rbind` just redirects the user to use `Table$create`
* Changed `c.Array()` to raise an error directing the user to either `concat_arrays()` or `ChunkedArray$create()`, depending on whether they want a single copied array or zero-copy chunked concatenation.
* Implemented `c.ChunkedArray`
Closes #12751 from wjones127/ARROW-15989-rbind-table
Lead-authored-by: Will Jones <wi...@gmail.com>
Co-authored-by: Neal Richardson <ne...@gmail.com>
Co-authored-by: Dewey Dunnington <de...@fishandwhistle.net>
Signed-off-by: Neal Richardson <ne...@gmail.com>
---
r/NAMESPACE | 7 ++
r/R/array.R | 12 +--
r/R/arrow-package.R | 2 +-
r/R/arrowExports.R | 4 +
r/R/chunked-array.R | 19 ++++-
r/R/record-batch.R | 53 +++++++++++++
r/R/table.R | 81 +++++++++++++++++++
r/_pkgdown.yml | 1 +
r/man/ChunkedArray.Rd | 5 ++
r/man/concat_arrays.Rd | 5 +-
r/man/concat_tables.Rd | 29 +++++++
r/src/arrowExports.cpp | 10 +++
r/src/chunkedarray.cpp | 3 +-
r/src/table.cpp | 8 ++
r/tests/testthat/_snaps/Array.md | 6 ++
r/tests/testthat/_snaps/RecordBatch.md | 9 +++
r/tests/testthat/_snaps/Table.md | 5 ++
r/tests/testthat/test-Array.R | 24 +-----
r/tests/testthat/test-RecordBatch.R | 99 ++++++++++++++++++++++-
r/tests/testthat/test-Table.R | 141 +++++++++++++++++++++++++++++++++
r/tests/testthat/test-chunked-array.R | 33 ++++++++
21 files changed, 521 insertions(+), 35 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index da43a3f511..d6e67c85a8 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -39,7 +39,10 @@ S3method(as.list,Schema)
S3method(as.raw,Buffer)
S3method(as.vector,ArrowDatum)
S3method(c,Array)
+S3method(c,ChunkedArray)
S3method(c,Dataset)
+S3method(cbind,RecordBatch)
+S3method(cbind,Table)
S3method(dim,ArrowTabular)
S3method(dim,Dataset)
S3method(dim,RecordBatchReader)
@@ -86,6 +89,8 @@ S3method(print,arrow_dplyr_query)
S3method(print,arrow_info)
S3method(print,arrow_r_metadata)
S3method(quantile,ArrowDatum)
+S3method(rbind,RecordBatch)
+S3method(rbind,Table)
S3method(read_message,InputStream)
S3method(read_message,MessageReader)
S3method(read_message,default)
@@ -218,6 +223,7 @@ export(cast_options)
export(chunked_array)
export(codec_is_available)
export(concat_arrays)
+export(concat_tables)
export(contains)
export(copy_files)
export(cpu_count)
@@ -325,6 +331,7 @@ importFrom(bit64,print.integer64)
importFrom(bit64,str.integer64)
importFrom(methods,as)
importFrom(purrr,as_mapper)
+importFrom(purrr,flatten)
importFrom(purrr,imap)
importFrom(purrr,imap_chr)
importFrom(purrr,keep)
diff --git a/r/R/array.R b/r/R/array.R
index 965e3bfc33..4e7fbdab7c 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -222,10 +222,7 @@ Array$import_from_c <- ImportArray
#' Concatenates zero or more [Array] objects into a single
#' array. This operation will make a copy of its input; if you need
#' the behavior of a single Array but don't need a
-#' single object, use [ChunkedArray]. Note that a [c()]
-#' method is provided for convenience but that it may
-#' produce surprising results when used with other
-#' classes of objects.
+#' single object, use [ChunkedArray].
#'
#' @param ... zero or more [Array] objects to concatenate
#' @param type An optional `type` describing the desired
@@ -236,7 +233,6 @@ Array$import_from_c <- ImportArray
#'
#' @examplesIf arrow_available()
#' concat_arrays(Array$create(1:3), Array$create(4:5))
-#'
concat_arrays <- function(..., type = NULL) {
dots <- lapply(list2(...), Array$create, type = type)
@@ -256,7 +252,11 @@ concat_arrays <- function(..., type = NULL) {
#' @rdname concat_arrays
#' @export
c.Array <- function(...) {
- concat_arrays(...)
+ abort(c(
+ "Use `concat_arrays()` or `ChunkedArray$create()` instead.",
+ i = "`concat_arrays()` creates a new Array by copying data.",
+ i = "`ChunkedArray$create()` uses the arrays as chunks for zero-copy concatenation."
+ ))
}
#' @rdname array
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 3c810bb8f2..624cca9e7d 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -17,7 +17,7 @@
#' @importFrom stats quantile median na.omit na.exclude na.pass na.fail
#' @importFrom R6 R6Class
-#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr
+#' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dfr map_int map_lgl keep imap imap_chr flatten
#' @importFrom assertthat assert_that is.string
#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos
#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 6bf9a75d0f..23309f70d1 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1980,6 +1980,10 @@ Table__ReferencedBufferSize <- function(table) {
.Call(`_arrow_Table__ReferencedBufferSize`, table)
}
+Table__ConcatenateTables <- function(tables, unify_schemas) {
+ .Call(`_arrow_Table__ConcatenateTables`, tables, unify_schemas)
+}
+
GetCpuThreadPoolCapacity <- function() {
.Call(`_arrow_GetCpuThreadPoolCapacity`)
}
diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R
index 95a05aba5b..c91b125af4 100644
--- a/r/R/chunked-array.R
+++ b/r/R/chunked-array.R
@@ -77,6 +77,11 @@
#' # When constructing a ChunkedArray, the first chunk is used to infer type.
#' doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L))
#' doubles$type
+#'
+#' # Concatenating chunked arrays returns a new chunked array containing all chunks
+#' a <- chunked_array(c(1, 2), 3)
+#' b <- chunked_array(c(4, 5), 6)
+#' c(a, b)
#' @export
ChunkedArray <- R6Class("ChunkedArray",
inherit = ArrowDatum,
@@ -145,7 +150,19 @@ ChunkedArray$create <- function(..., type = NULL) {
if (!is.null(type)) {
type <- as_type(type)
}
- ChunkedArray__from_list(list2(...), type)
+ chunks <- flatten(map(list2(...), function(arr) {
+ if (inherits(arr, "ChunkedArray")) {
+ arr$chunks
+ } else {
+ list(arr)
+ }
+ }))
+ ChunkedArray__from_list(chunks, type)
+}
+
+#' @export
+c.ChunkedArray <- function(...) {
+ ChunkedArray$create(...)
}
#' @param \dots Vectors to coerce
diff --git a/r/R/record-batch.R b/r/R/record-batch.R
index 24bd61535e..03f97a5130 100644
--- a/r/R/record-batch.R
+++ b/r/R/record-batch.R
@@ -189,3 +189,56 @@ record_batch <- RecordBatch$create
#' @export
names.RecordBatch <- function(x) x$names()
+
+#' @export
+rbind.RecordBatch <- function(...) {
+ abort("Use `Table$create()` to combine RecordBatches into a Table")
+}
+
+cbind_check_length <- function(inputs, call = caller_env()) {
+ sizes <- map_int(inputs, NROW)
+ ok_lengths <- sizes %in% c(head(sizes, 1), 1L)
+ if (!all(ok_lengths)) {
+ first_bad_one <- which.min(ok_lengths)
+ abort(
+ c("Non-scalar inputs must have an equal number of rows.",
+ i = sprintf("..1 has %d, ..%d has %d", sizes[[1]], first_bad_one, sizes[[first_bad_one]])),
+ call = call
+ )
+ }
+}
+
+#' @export
+cbind.RecordBatch <- function(...) {
+ call <- sys.call()
+ inputs <- list(...)
+ arg_names <- if (is.null(names(inputs))) {
+ rep("", length(inputs))
+ } else {
+ names(inputs)
+ }
+
+ cbind_check_length(inputs, call)
+
+ columns <- flatten(map(seq_along(inputs), function(i) {
+ input <- inputs[[i]]
+ name <- arg_names[i]
+
+ if (inherits(input, "RecordBatch")) {
+ set_names(input$columns, names(input))
+ } else if (inherits(input, "data.frame")) {
+ as.list(input)
+ } else if (inherits(input, "Table") || inherits(input, "ChunkedArray")) {
+ abort("Cannot cbind a RecordBatch with Tables or ChunkedArrays",
+ i = "Hint: consider converting the RecordBatch into a Table first")
+ } else {
+ if (name == "") {
+ abort("Vector and array arguments must have names",
+ i = sprintf("Argument ..%d is missing a name", i))
+ }
+ list2("{name}" := input)
+ }
+ }))
+
+ RecordBatch$create(!!! columns)
+}
diff --git a/r/R/table.R b/r/R/table.R
index 07750786ee..102d0ecd10 100644
--- a/r/R/table.R
+++ b/r/R/table.R
@@ -149,6 +149,87 @@ Table$create <- function(..., schema = NULL) {
#' @export
names.Table <- function(x) x$ColumnNames()
+#' Concatenate one or more Tables
+#'
+#' Concatenate one or more [Table] objects into a single table. This operation
+#' does not copy array data, but instead creates new chunked arrays for each
+#' column that point at existing array data.
+#'
+#' @param ... A [Table]
+#' @param unify_schemas If TRUE, the schemas of the tables will be first unified
+#' with fields of the same name being merged, then each table will be promoted
+#' to the unified schema before being concatenated. Otherwise, all tables should
+#' have the same schema.
+#' @examplesIf arrow_available()
+#' tbl <- arrow_table(name = rownames(mtcars), mtcars)
+#' prius <- arrow_table(name = "Prius", mpg = 58, cyl = 4, disp = 1.8)
+#' combined <- concat_tables(tbl, prius)
+#' tail(combined)$to_data_frame()
+#' @export
+concat_tables <- function(..., unify_schemas = TRUE) {
+ tables <- list2(...)
+
+ if (length(tables) == 0) {
+ abort("Must pass at least one Table.")
+ }
+
+ if (!unify_schemas) {
+ # assert they have same schema
+ schema <- tables[[1]]$schema
+ unequal_schema_idx <- which.min(lapply(tables, function(x) x$schema == schema))
+ if (unequal_schema_idx != 1) {
+ abort(c(
+ sprintf("Schema at index %i does not match the first schema.", unequal_schema_idx),
+ i = paste0("Schema 1:\n", schema$ToString()),
+ i = paste0(
+ sprintf("Schema %d:\n", unequal_schema_idx),
+ tables[[unequal_schema_idx]]$schema$ToString()
+ )
+ ))
+ }
+ }
+
+ Table__ConcatenateTables(tables, unify_schemas)
+}
+
+#' @export
+rbind.Table <- function(...) {
+ concat_tables(..., unify_schemas = FALSE)
+}
+
+#' @export
+cbind.Table <- function(...) {
+ call <- sys.call()
+ inputs <- list(...)
+ arg_names <- if (is.null(names(inputs))) {
+ rep("", length(inputs))
+ } else {
+ names(inputs)
+ }
+
+ cbind_check_length(inputs, call)
+
+ columns <- flatten(map(seq_along(inputs), function(i) {
+ input <- inputs[[i]]
+ name <- arg_names[i]
+
+ if (inherits(input, "ArrowTabular")) {
+ set_names(input$columns, names(input))
+ } else if (inherits(input, "data.frame")) {
+ as.list(input)
+ } else {
+ if (name == "") {
+ abort("Vector and array arguments must have names",
+ i = sprintf("Argument ..%d is missing a name", i)
+ )
+ }
+ list2("{name}" := input)
+ }
+ }))
+
+ Table$create(!!!columns)
+}
+
#' @param ... A `data.frame` or a named set of Arrays or vectors. If given a
#' mixture of data.frames and named vectors, the inputs will be autospliced together
#' (see examples). Alternatively, you can provide a single Arrow IPC
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index c3810cdf09..713af8578f 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -144,6 +144,7 @@ reference:
- buffer
- read_message
- concat_arrays
+ - concat_tables
- ExtensionArray
- vctrs_extension_array
- title: Arrow data types and schema
diff --git a/r/man/ChunkedArray.Rd b/r/man/ChunkedArray.Rd
index 4da31d28e2..ab5e0f73c2 100644
--- a/r/man/ChunkedArray.Rd
+++ b/r/man/ChunkedArray.Rd
@@ -74,6 +74,11 @@ list_scores$num_chunks
# When constructing a ChunkedArray, the first chunk is used to infer type.
doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L))
doubles$type
+
+# Concatenating chunked arrays returns a new chunked array containing all chunks
+a <- chunked_array(c(1, 2), 3)
+b <- chunked_array(c(4, 5), 6)
+c(a, b)
\dontshow{\}) # examplesIf}
}
\seealso{
diff --git a/r/man/concat_arrays.Rd b/r/man/concat_arrays.Rd
index 795445b1c2..0cbe7ba578 100644
--- a/r/man/concat_arrays.Rd
+++ b/r/man/concat_arrays.Rd
@@ -22,10 +22,7 @@ A single \link{Array}
Concatenates zero or more \link{Array} objects into a single
array. This operation will make a copy of its input; if you need
the behavior of a single Array but don't need a
-single object, use \link{ChunkedArray}. Note that a \code{\link[=c]{c()}}
-method is provided for convenience but that it may
-produce surprising results when used with other
-classes of objects.
+single object, use \link{ChunkedArray}.
}
\examples{
\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
diff --git a/r/man/concat_tables.Rd b/r/man/concat_tables.Rd
new file mode 100644
index 0000000000..a03fc49a33
--- /dev/null
+++ b/r/man/concat_tables.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/table.R
+\name{concat_tables}
+\alias{concat_tables}
+\title{Concatenate one or more Tables}
+\usage{
+concat_tables(..., unify_schemas = TRUE)
+}
+\arguments{
+\item{...}{A \link{Table}}
+
+\item{unify_schemas}{If TRUE, the schemas of the tables will be first unified
+with fields of the same name being merged, then each table will be promoted
+to the unified schema before being concatenated. Otherwise, all tables should
+have the same schema.}
+}
+\description{
+Concatenate one or more \link{Table} objects into a single table. This operation
+does not copy array data, but instead creates new chunked arrays for each
+column that point at existing array data.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tbl <- arrow_table(name = rownames(mtcars), mtcars)
+prius <- arrow_table(name = "Prius", mpg = 58, cyl = 4, disp = 1.8)
+combined <- concat_tables(tbl, prius)
+tail(combined)$to_data_frame()
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 760b71a5be..5440dd3e62 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -5019,6 +5019,15 @@ BEGIN_CPP11
return cpp11::as_sexp(Table__ReferencedBufferSize(table));
END_CPP11
}
+// table.cpp
+std::shared_ptr<arrow::Table> Table__ConcatenateTables(const std::vector<std::shared_ptr<arrow::Table>>& tables, bool unify_schemas);
+extern "C" SEXP _arrow_Table__ConcatenateTables(SEXP tables_sexp, SEXP unify_schemas_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::vector<std::shared_ptr<arrow::Table>>&>::type tables(tables_sexp);
+ arrow::r::Input<bool>::type unify_schemas(unify_schemas_sexp);
+ return cpp11::as_sexp(Table__ConcatenateTables(tables, unify_schemas));
+END_CPP11
+}
// threadpool.cpp
int GetCpuThreadPoolCapacity();
extern "C" SEXP _arrow_GetCpuThreadPoolCapacity(){
@@ -5615,6 +5624,7 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_all_record_batches", (DL_FUNC) &_arrow_all_record_batches, 1},
{ "_arrow_Table__from_record_batches", (DL_FUNC) &_arrow_Table__from_record_batches, 2},
{ "_arrow_Table__ReferencedBufferSize", (DL_FUNC) &_arrow_Table__ReferencedBufferSize, 1},
+ { "_arrow_Table__ConcatenateTables", (DL_FUNC) &_arrow_Table__ConcatenateTables, 2},
{ "_arrow_GetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_GetCpuThreadPoolCapacity, 0},
{ "_arrow_SetCpuThreadPoolCapacity", (DL_FUNC) &_arrow_SetCpuThreadPoolCapacity, 1},
{ "_arrow_GetIOThreadPoolCapacity", (DL_FUNC) &_arrow_GetIOThreadPoolCapacity, 0},
diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp
index 72ae420061..bfc08008de 100644
--- a/r/src/chunkedarray.cpp
+++ b/r/src/chunkedarray.cpp
@@ -141,7 +141,8 @@ std::shared_ptr<arrow::ChunkedArray> ChunkedArray__from_list(cpp11::list chunks,
}
}
- return std::make_shared<arrow::ChunkedArray>(std::move(vec));
+ // Use Make so we validate that chunk types are all the same
+ return ValueOrStop(arrow::ChunkedArray::Make(std::move(vec)));
}
// [[arrow::export]]
diff --git a/r/src/table.cpp b/r/src/table.cpp
index 5168dc9784..2d2d35b06a 100644
--- a/r/src/table.cpp
+++ b/r/src/table.cpp
@@ -289,4 +289,12 @@ int64_t Table__ReferencedBufferSize(const std::shared_ptr<arrow::Table>& table)
return ValueOrStop(arrow::util::ReferencedBufferSize(*table));
}
+// [[arrow::export]]
+std::shared_ptr<arrow::Table> Table__ConcatenateTables(
+ const std::vector<std::shared_ptr<arrow::Table>>& tables, bool unify_schemas) {
+ arrow::ConcatenateTablesOptions options;
+ options.unify_schemas = unify_schemas;
+ return ValueOrStop(arrow::ConcatenateTables(tables, options));
+}
+
#endif
diff --git a/r/tests/testthat/_snaps/Array.md b/r/tests/testthat/_snaps/Array.md
new file mode 100644
index 0000000000..3f8ebe966d
--- /dev/null
+++ b/r/tests/testthat/_snaps/Array.md
@@ -0,0 +1,6 @@
+# Array doesn't support c()
+
+ Use `concat_arrays()` or `ChunkedArray$create()` instead.
+ i `concat_arrays()` creates a new Array by copying data.
+ i `ChunkedArray$create()` uses the arrays as chunks for zero-copy concatenation.
+
diff --git a/r/tests/testthat/_snaps/RecordBatch.md b/r/tests/testthat/_snaps/RecordBatch.md
new file mode 100644
index 0000000000..30aef6164a
--- /dev/null
+++ b/r/tests/testthat/_snaps/RecordBatch.md
@@ -0,0 +1,9 @@
+# RecordBatch doesn't support rbind
+
+ Use `Table$create()` to combine RecordBatches into a Table
+
+# RecordBatch supports cbind
+
+ Non-scalar inputs must have an equal number of rows.
+ i ..1 has 10, ..2 has 2
+
diff --git a/r/tests/testthat/_snaps/Table.md b/r/tests/testthat/_snaps/Table.md
new file mode 100644
index 0000000000..47a565be68
--- /dev/null
+++ b/r/tests/testthat/_snaps/Table.md
@@ -0,0 +1,5 @@
+# Table supports cbind
+
+ Non-scalar inputs must have an equal number of rows.
+ i ..1 has 10, ..2 has 2
+
diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R
index 2f75efb3d6..1e774d7fb6 100644
--- a/r/tests/testthat/test-Array.R
+++ b/r/tests/testthat/test-Array.R
@@ -1056,30 +1056,12 @@ test_that("concat_arrays() coerces its input to Array", {
)
})
-test_that("c() works for Array", {
- expect_r6_class(c(Array$create(1L), Array$create(1L)), "Array")
-
- struct <- call_function(
- "make_struct",
- Array$create(1L),
- options = list(field_names = "")
+test_that("Array doesn't support c()", {
+ expect_snapshot_error(
+ c(Array$create(1:2), Array$create(3:5))
)
- expect_r6_class(c(struct, struct), "StructArray")
-
- list <- Array$create(list(1))
- expect_r6_class(c(list, list), "ListArray")
-
- list <- Array$create(list(), type = large_list_of(float64()))
- expect_r6_class(c(list, list), "LargeListArray")
-
- list <- Array$create(list(), type = fixed_size_list_of(float64(), 1L))
- expect_r6_class(c(list, list), "FixedSizeListArray")
-
- list <- Array$create(list(), type = map_of(string(), float64()))
- expect_r6_class(c(list, list), "MapArray")
})
-
test_that("Array to C-interface", {
# create a struct array since that's one of the more complicated array types
df <- tibble::tibble(x = 1:10, y = x / 2, z = letters[1:10])
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index d280754a33..c284b7b1d5 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -513,6 +513,103 @@ test_that("record_batch() with different length arrays", {
expect_error(record_batch(a = 1:5, b = 1:6), msg)
})
+test_that("RecordBatch doesn't support rbind", {
+ expect_snapshot_error(
+ rbind(
+ record_batch(a = 1:10),
+ record_batch(a = 2:4)
+ )
+ )
+})
+
+test_that("RecordBatch supports cbind", {
+ expect_snapshot_error(
+ cbind(
+ record_batch(a = 1:10),
+ record_batch(a = c("a", "b"))
+ )
+ )
+ expect_error(
+ cbind(record_batch(a = 1:10), record_batch(b = character(0))),
+ regexp = "Non-scalar inputs must have an equal number of rows"
+ )
+
+ actual <- cbind(
+ record_batch(a = c(1, 2), b = c("a", "b")),
+ record_batch(a = c("d", "c")),
+ record_batch(c = c(2, 3))
+ )
+ expected <- record_batch(
+ a = c(1, 2),
+ b = c("a", "b"),
+ a = c("d", "c"),
+ c = c(2, 3)
+ )
+ expect_equal(actual, expected)
+
+ # cbind() with one argument returns identical table
+ expected <- record_batch(a = 1:10)
+ expect_equal(expected, cbind(expected))
+
+ # Handles arrays
+ expect_equal(
+ cbind(record_batch(a = 1:2), b = Array$create(4:5)),
+ record_batch(a = 1:2, b = 4:5)
+ )
+
+ # Handles data.frames on R 4.0 or greater
+ if (getRversion() >= "4.0.0") {
+ # Prior to R 4.0, cbind would short-circuit to the data.frame implementation
+ # if **any** of the arguments are a data.frame.
+ expect_equal(
+ cbind(record_batch(a = 1:2), data.frame(b = 4:5)),
+ record_batch(a = 1:2, b = 4:5)
+ )
+ }
+
+ # Handles base factors
+ expect_equal(
+ cbind(record_batch(a = 1:2), b = factor(c("a", "b"))),
+ record_batch(a = 1:2, b = factor(c("a", "b")))
+ )
+
+ # Handles base scalars
+ expect_equal(
+ cbind(record_batch(a = 1:2), b = 1L),
+ record_batch(a = 1:2, b = rep(1L, 2))
+ )
+
+ # Handles zero rows
+ expect_equal(
+ cbind(record_batch(a = character(0)), b = Array$create(numeric(0)), c = integer(0)),
+ record_batch(a = character(0), b = numeric(0), c = integer(0)),
+ )
+
+ # Rejects unnamed arrays, even in cases where no named arguments are passed
+ expect_error(
+ cbind(record_batch(a = 1:2), b = 3:4, 5:6),
+ regexp = "Vector and array arguments must have names"
+ )
+ expect_error(
+ cbind(record_batch(a = 1:2), 3:4, 5:6),
+ regexp = "Vector and array arguments must have names"
+ )
+
+ # Rejects Table and ChunkedArray arguments
+ if (getRversion() >= "4.0.0") {
+ # R 3.6 cbind dispatch rules cause cbind to fall back to default impl if
+ # there are multiple arguments with distinct cbind implementations
+ expect_error(
+ cbind(record_batch(a = 1:2), arrow_table(b = 3:4)),
+ regexp = "Cannot cbind a RecordBatch with Tables or ChunkedArrays"
+ )
+ }
+ expect_error(
+ cbind(record_batch(a = 1:2), b = chunked_array(1, 2)),
+ regexp = "Cannot cbind a RecordBatch with Tables or ChunkedArrays"
+ )
+})
+
test_that("Handling string data with embedded nuls", {
raws <- Array$create(structure(list(
as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
@@ -655,7 +752,7 @@ test_that("RecordBatch to C-interface", {
# then import it and check that the roundtripped value is the same
circle <- RecordBatch$import_from_c(array_ptr, schema_ptr)
- expect_equal
+ expect_equal(batch, circle)
# must clean up the pointers or we leak
delete_arrow_schema(schema_ptr)
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index 44144c00ba..89c22b97e1 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -518,6 +518,147 @@ test_that("Table$create() no recycling with tibbles", {
)
})
+test_that("Tables can be combined with concat_tables()", {
+ expect_error(
+ concat_tables(arrow_table(a = 1:10), arrow_table(a = c("a", "b")), unify_schemas = FALSE),
+ regexp = "Schema at index 2 does not match the first schema"
+ )
+
+ expect_error(
+ concat_tables(arrow_table(a = 1:10), arrow_table(a = c("a", "b")), unify_schemas = TRUE),
+ regexp = "Unable to merge: Field a has incompatible types: int32 vs string"
+ )
+ expect_error(
+ concat_tables(),
+ regexp = "Must pass at least one Table"
+ )
+
+ expect_equal(
+ concat_tables(
+ arrow_table(a = 1:5),
+ arrow_table(a = 6:7, b = c("d", "e"))
+ ),
+ arrow_table(a = 1:7, b = c(rep(NA, 5), "d", "e"))
+ )
+
+ # concat_tables() with one argument returns identical table
+ expected <- arrow_table(a = 1:10)
+ expect_equal(expected, concat_tables(expected))
+})
+
+test_that("Table supports rbind", {
+ expect_error(
+ rbind(arrow_table(a = 1:10), arrow_table(a = c("a", "b"))),
+ regexp = "Schema at index 2 does not match the first schema"
+ )
+
+ tables <- list(
+ arrow_table(a = 1:10, b = Scalar$create("x")),
+ arrow_table(a = 2:42, b = Scalar$create("y")),
+ arrow_table(a = 8:10, b = Scalar$create("z"))
+ )
+ expected <- Table$create(do.call(rbind, lapply(tables, as.data.frame)))
+ actual <- do.call(rbind, tables)
+ expect_equal(actual, expected, ignore_attr = TRUE)
+
+ # rbind with empty table produces identical table
+ expected <- arrow_table(a = 1:10, b = Scalar$create("x"))
+ expect_equal(
+ rbind(expected, arrow_table(a = integer(0), b = character(0))),
+ expected
+ )
+ # rbind() with one argument returns identical table
+ expect_equal(rbind(expected), expected)
+})
+
+test_that("Table supports cbind", {
+ expect_snapshot_error(
+ cbind(
+ arrow_table(a = 1:10),
+ arrow_table(a = c("a", "b"))
+ )
+ )
+ expect_error(
+ cbind(arrow_table(a = 1:10), arrow_table(b = character(0))),
+ regexp = "Non-scalar inputs must have an equal number of rows"
+ )
+
+ actual <- cbind(
+ arrow_table(a = 1:10, b = Scalar$create("x")),
+ arrow_table(a = 11:20, b = Scalar$create("y")),
+ arrow_table(c = 1:10)
+ )
+ expected <- arrow_table(cbind(
+ tibble::tibble(a = 1:10, b = "x"),
+ tibble::tibble(a = 11:20, b = "y"),
+ tibble::tibble(c = 1:10)
+ ))
+ expect_equal(actual, expected, ignore_attr = TRUE)
+
+ # cbind() with one argument returns identical table
+ expected <- arrow_table(a = 1:10)
+ expect_equal(expected, cbind(expected))
+
+ # Handles Arrow arrays and chunked arrays
+ expect_equal(
+ cbind(arrow_table(a = 1:2), b = Array$create(4:5)),
+ arrow_table(a = 1:2, b = 4:5)
+ )
+ expect_equal(
+ cbind(arrow_table(a = 1:2), b = chunked_array(4, 5)),
+ arrow_table(a = 1:2, b = chunked_array(4, 5))
+ )
+
+ # Handles data.frame
+ if (getRversion() >= "4.0.0") {
+ # Prior to R 4.0, cbind would short-circuit to the data.frame implementation
+ # if **any** of the arguments are a data.frame.
+ expect_equal(
+ cbind(arrow_table(a = 1:2), data.frame(b = 4:5)),
+ arrow_table(a = 1:2, b = 4:5)
+ )
+ }
+
+ # Handles factors
+ expect_equal(
+ cbind(arrow_table(a = 1:2), b = factor(c("a", "b"))),
+ arrow_table(a = 1:2, b = factor(c("a", "b")))
+ )
+
+ # Handles scalar values
+ expect_equal(
+ cbind(arrow_table(a = 1:2), b = "x"),
+ arrow_table(a = 1:2, b = c("x", "x"))
+ )
+
+ # Handles zero rows
+ expect_equal(
+ cbind(arrow_table(a = character(0)), b = Array$create(numeric(0)), c = integer(0)),
+ arrow_table(a = character(0), b = numeric(0), c = integer(0)),
+ )
+
+ # Rejects unnamed arrays, even in cases where no named arguments are passed
+ expect_error(
+ cbind(arrow_table(a = 1:2), b = 3:4, 5:6),
+ regexp = "Vector and array arguments must have names"
+ )
+ expect_error(
+ cbind(arrow_table(a = 1:2), 3:4, 5:6),
+ regexp = "Vector and array arguments must have names"
+ )
+})
+
+test_that("cbind.Table handles record batches and tables", {
+ # R 3.6 cbind dispatch rules cause cbind to fall back to default impl if
+ # there are multiple arguments with distinct cbind implementations
+ skip_if(getRversion() < "4.0.0", "R 3.6 cbind dispatch rules prevent this behavior")
+
+ expect_equal(
+ cbind(arrow_table(a = 1L:2L), record_batch(b = 4:5)),
+ arrow_table(a = 1L:2L, b = 4:5)
+ )
+})
+
test_that("ARROW-11769 - grouping preserved in table creation", {
skip_if_not_available("dataset")
diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R
index 730868fa5e..87d2a9d92a 100644
--- a/r/tests/testthat/test-chunked-array.R
+++ b/r/tests/testthat/test-chunked-array.R
@@ -91,6 +91,27 @@ test_that("ChunkedArray", {
expect_warning(z$Slice(2, 10), "Slice 'length' greater than available length")
})
+test_that("ChunkedArray can be constructed from Array and ChunkedArrays", {
+ expect_equal(
+ chunked_array(Array$create(1:2), Array$create(3:4)),
+ chunked_array(1:2, 3:4),
+ )
+ expect_equal(
+ chunked_array(chunked_array(1:2, 3:4), chunked_array(5:6)),
+ chunked_array(1:2, 3:4, 5:6),
+ )
+
+ # Cannot mix array types
+ expect_error(
+ chunked_array(Array$create(1:2), Array$create(c("a", "b"))),
+ regexp = "Array chunks must all be same type"
+ )
+ expect_error(
+ chunked_array(chunked_array(1:2), chunked_array(c("a", "b"))),
+ regexp = "Array chunks must all be same type"
+ )
+})
+
test_that("print ChunkedArray", {
verify_output(test_path("test-chunked-array.txt"), {
chunked_array(c(1, 2, 3), c(4, 5, 6))
@@ -100,6 +121,18 @@ test_that("print ChunkedArray", {
})
})
+test_that("ChunkedArray can be concatenated with c()", {
+ a <- chunked_array(c(1, 2), 3)
+ b <- chunked_array(c(4, 5), 6)
+ expected <- chunked_array(c(1, 2), 3, c(4, 5), 6)
+ expect_equal(c(a, b), expected)
+
+ # Can handle Arrays and base vectors
+ vectors <- list(chunked_array(1:10), Array$create(1:10), 1:10)
+ expected <- chunked_array(1:10, 1:10, 1:10)
+ expect_equal(do.call(c, vectors), expected)
+})
+
test_that("ChunkedArray handles !!! splicing", {
data <- list(1, 2, 3)
x <- chunked_array(!!!data)