You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2022/11/18 19:16:16 UTC
[arrow-nanoarrow] branch main updated: [R] Complete ptype inferences and array conversions (#65)
This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new e6e71d6 [R] Complete ptype inferences and array conversions (#65)
e6e71d6 is described below
commit e6e71d61ac103d26d48d907b7efd738a33c704a2
Author: Dewey Dunnington <de...@fishandwhistle.net>
AuthorDate: Fri Nov 18 15:16:09 2022 -0400
[R] Complete ptype inferences and array conversions (#65)
* basic unspecified
* with conversion to unspecified for other types
* conversion from null array to stuff
* use schema printing in error messages
* pop back into R to infer array ptypes
* add schema parser
* add some more type inferences
* fixed size to matrix
* start shuffling to a more flexible materialize system
* prepare to use new materialize for more stuff
* inching more towards materializing
* move over int materializing to new system
* move dbl to new system
* chr materialization to new system
* better names
* big swaperoo
* don't special case stuff from other packages
* reorganize + rename
* split off ptype inference and make it work on schemas
* api tweaks
* termo changes
* implement some list_of conversions
* infer ptype for dates
* failing tests for datetime types
* don't support matrices
* most datetime types handled
* difftime units
* date64 support
* make sure null -> difftime works
* nix the 'context' argument
* draft converter api
* actually use converter api
* fix errors in converter API
* use converter for more types
* make altrep use the converter API
* more using the converter api
* improve coverage
* document the converter API
* separate converter from materializer
* materialize_array -> convert_array
* more renaming
* one more
* array stream convert prototype
* stub fixed-size array stream collection
* progress on nested materialize calls
* clean + format
* fix errors identified in chunked flights collection
* split up class-specific materialize logic
* better support for struct-style ptypes
* fix nested data.frame
* functional ptypes + don't use system timezone
* start documenting conversions
* documenting conversions + add batch pull limit
* better docs for array streams
* add test for output that might not have been allocated with enough space
* run format()
* fallback decimal conversion
* make sure recursive
* maybe fix ci error
* test nested types
* less silent stripping
* Update r/src/materialize.c
Co-authored-by: Romain François <ro...@rstudio.com>
* document rowname behaviour
* rowname handling for generating results
* cache namespaces and make protection calls easier to read
* simplify nanoarrow_alloc_type()
* cache SEXPs for common classes
* carefuller Rf_getAttrib() calls with respect to rownames
* rename nanoarrow_converter_result() to make it more obvious what's going on
* explain PROTECT() call
* fix call_stop_cant_convert_array() to make rchk happy
Co-authored-by: Romain François <ro...@rstudio.com>
---
r/DESCRIPTION | 2 +
r/NAMESPACE | 11 +-
r/R/altrep.R | 4 +-
r/R/array-convert-vector.R | 127 -----
r/R/array-stream.R | 15 +
r/R/array.R | 4 +-
r/R/convert-array-stream.R | 84 +++
r/R/convert-array.R | 153 ++++++
r/R/infer-ptype.R | 127 +++++
r/R/schema.R | 16 +
r/man/as_nanoarrow_schema.Rd | 6 +
r/man/convert_array.Rd | 68 +++
r/man/convert_array_stream.Rd | 38 ++
r/man/from_nanoarrow_array.Rd | 25 -
r/man/infer_nanoarrow_ptype.Rd | 44 ++
r/src/altrep.c | 89 ++--
r/src/altrep.h | 2 +-
r/src/array.c | 7 +-
r/src/array.h | 9 +-
r/src/array_convert_vector.c | 359 -------------
r/src/array_stream.h | 3 +-
r/src/array_view.c | 3 +-
r/src/convert.c | 491 +++++++++++++++++
r/src/convert.h | 66 +++
r/src/convert_array.c | 217 ++++++++
r/src/convert_array_stream.c | 96 ++++
r/src/infer_ptype.c | 149 ++++++
r/src/init.c | 16 +-
r/src/materialize.c | 408 +++++++--------
r/src/materialize.h | 27 +-
r/src/materialize_blob.h | 60 +++
r/src/materialize_chr.h | 61 +++
r/src/materialize_common.h | 106 ++++
r/src/{materialize.h => materialize_date.h} | 29 +-
r/src/materialize_dbl.h | 124 +++++
r/src/materialize_difftime.h | 92 ++++
r/src/materialize_int.h | 123 +++++
r/src/materialize_lgl.h | 88 ++++
r/src/materialize_posixct.h | 79 +++
r/src/materialize_unspecified.h | 58 ++
r/src/schema.c | 101 +++-
r/src/schema.h | 3 +-
r/src/util.c | 51 ++
r/src/{materialize.h => util.h} | 22 +-
r/tests/testthat/test-array-convert-vector.R | 375 -------------
r/tests/testthat/test-array-stream.R | 16 +
r/tests/testthat/test-array.R | 6 +-
r/tests/testthat/test-convert-array-stream.R | 208 ++++++++
r/tests/testthat/test-convert-array.R | 755 +++++++++++++++++++++++++++
r/tests/testthat/test-infer-ptype.R | 132 +++++
r/tests/testthat/test-schema.R | 36 ++
51 files changed, 3990 insertions(+), 1201 deletions(-)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index d84baf4..4956b77 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -23,6 +23,8 @@ URL: https://github.com/apache/arrow-nanoarrow
BugReports: https://github.com/apache/arrow-nanoarrow/issues
Suggests:
arrow (>= 8.0.0),
+ blob,
+ hms,
testthat (>= 3.0.0),
tibble,
vctrs
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 1f285a3..6bfd421 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -7,8 +7,10 @@ S3method("[[",nanoarrow_array)
S3method("[[",nanoarrow_array_stream)
S3method("[[",nanoarrow_schema)
S3method(as.data.frame,nanoarrow_array)
+S3method(as.data.frame,nanoarrow_array_stream)
S3method(as.raw,nanoarrow_buffer)
S3method(as.vector,nanoarrow_array)
+S3method(as.vector,nanoarrow_array_stream)
S3method(as_nanoarrow_array,Array)
S3method(as_nanoarrow_array,ChunkedArray)
S3method(as_nanoarrow_array,RecordBatch)
@@ -21,13 +23,12 @@ S3method(as_nanoarrow_schema,DataType)
S3method(as_nanoarrow_schema,Field)
S3method(as_nanoarrow_schema,Schema)
S3method(as_nanoarrow_schema,nanoarrow_schema)
+S3method(convert_array,default)
+S3method(convert_array,vctrs_partial_frame)
S3method(format,nanoarrow_array)
S3method(format,nanoarrow_array_stream)
S3method(format,nanoarrow_buffer)
S3method(format,nanoarrow_schema)
-S3method(from_nanoarrow_array,default)
-S3method(from_nanoarrow_array,tbl_df)
-S3method(from_nanoarrow_array,vctrs_partial_frame)
S3method(infer_nanoarrow_schema,default)
S3method(infer_nanoarrow_schema,nanoarrow_array)
S3method(infer_nanoarrow_schema,nanoarrow_array_stream)
@@ -48,7 +49,8 @@ S3method(str,nanoarrow_schema)
export(as_nanoarrow_array)
export(as_nanoarrow_array_stream)
export(as_nanoarrow_schema)
-export(from_nanoarrow_array)
+export(convert_array)
+export(convert_array_stream)
export(infer_nanoarrow_ptype)
export(infer_nanoarrow_schema)
export(nanoarrow_allocate_array)
@@ -62,6 +64,7 @@ export(nanoarrow_pointer_export)
export(nanoarrow_pointer_is_valid)
export(nanoarrow_pointer_move)
export(nanoarrow_pointer_release)
+export(nanoarrow_schema_parse)
importFrom(utils,getFromNamespace)
importFrom(utils,str)
useDynLib(nanoarrow, .registration = TRUE)
diff --git a/r/R/altrep.R b/r/R/altrep.R
index ce3e40b..8071bd0 100644
--- a/r/R/altrep.R
+++ b/r/R/altrep.R
@@ -1,9 +1,7 @@
# For testing the altrep chr conversion
nanoarrow_altrep_chr <- function(array) {
- schema <- infer_nanoarrow_schema(array)
- array_view <- .Call(nanoarrow_c_array_view, array, schema)
- .Call(nanoarrow_c_make_altrep_chr, array_view)
+ .Call(nanoarrow_c_make_altrep_chr, array)
}
is_nanoarrow_altrep <- function(x) {
diff --git a/r/R/array-convert-vector.R b/r/R/array-convert-vector.R
deleted file mode 100644
index 3209fc4..0000000
--- a/r/R/array-convert-vector.R
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-#' Convert an Array to an R vector
-#'
-#' @param array A [nanoarrow_array][as_nanoarrow_array].
-#' @param to A target prototype object describing the type to which `array`
-#' should be converted, or `NULL` to use the default conversion.
-#' @param ... Passed to S3 methods
-#'
-#' @return An R vector of type `to`.
-#' @export
-#'
-from_nanoarrow_array <- function(array, to = NULL, ...) {
- stopifnot(inherits(array, "nanoarrow_array"))
- UseMethod("from_nanoarrow_array", to)
-}
-
-#' @export
-from_nanoarrow_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
- if (.from_c) {
- stop_cant_convert_array(array, to)
- }
-
- .Call(nanoarrow_c_from_array, array, to)
-}
-
-# This is defined because it's verbose to pass named arguments from C.
-# When converting data frame columns, we try the internal C conversions
-# first to save R evaluation overhead. When the internal conversions fail,
-# we call from_nanoarrow_array() to dispatch to conversions defined via S3
-# dispatch, making sure to let the default method know that we've already
-# tried the internal C conversions.
-from_nanoarrow_array_from_c <- function(array, to) {
- from_nanoarrow_array(array, to, .from_c = TRUE)
-}
-
-#' @export
-from_nanoarrow_array.vctrs_partial_frame <- function(array, to, ...) {
- ptype <- infer_nanoarrow_ptype(array)
- if (!is.data.frame(ptype)) {
- stop_cant_convert_array(array, to)
- }
-
- ptype <- vctrs::vec_ptype_common(ptype, to)
- .Call(nanoarrow_c_from_array, array, ptype)
-}
-
-#' @export
-from_nanoarrow_array.tbl_df <- function(array, to, ...) {
- df <- from_nanoarrow_array(array, as.data.frame(to))
- tibble::as_tibble(df)
-}
-
-#' @rdname from_nanoarrow_array
-#' @export
-infer_nanoarrow_ptype <- function(array) {
- stopifnot(inherits(array, "nanoarrow_array"))
- .Call(nanoarrow_c_infer_ptype, array)
-}
-
-stop_cant_infer_ptype <- function(array) {
- schema <- infer_nanoarrow_schema(array)
-
- if (is.null(schema$name) || identical(schema$name, "")) {
- cnd <- simpleError(
- sprintf(
- "Can't infer R vector type for array <%s>",
- schema$format
- ),
- call = sys.call(-1)
- )
- } else {
- cnd <- simpleError(
- sprintf(
- "Can't infer R vector type for `%s` <%s>",
- schema$name,
- schema$format
- ),
- call = sys.call(-1)
- )
- }
-
- stop(cnd)
-}
-
-stop_cant_convert_array <- function(array, to) {
- schema <- infer_nanoarrow_schema(array)
-
- if (is.null(schema$name) || identical(schema$name, "")) {
- cnd <- simpleError(
- sprintf(
- "Can't convert array <%s> to R vector of type %s",
- schema$format,
- class(to)[1]
- ),
- call = sys.call(-1)
- )
- } else {
- cnd <- simpleError(
- sprintf(
- "Can't convert `%s` <%s> to R vector of type %s",
- schema$name,
- schema$format,
- class(to)[1]
- ),
- call = sys.call(-1)
- )
- }
-
- stop(cnd)
-}
diff --git a/r/R/array-stream.R b/r/R/array-stream.R
index 4f30c1b..6ea7c39 100644
--- a/r/R/array-stream.R
+++ b/r/R/array-stream.R
@@ -71,6 +71,21 @@ infer_nanoarrow_schema.nanoarrow_array_stream <- function(x, ...) {
x$get_schema()
}
+#' @export
+as.data.frame.nanoarrow_array_stream <- function(x, ...) {
+ to <- infer_nanoarrow_ptype(x$get_schema())
+ if (!inherits(to, "data.frame")) {
+ stop("Can't convert non-struct array stream to data.frame")
+ }
+
+ convert_array_stream(x, to)
+}
+
+#' @export
+as.vector.nanoarrow_array_stream <- function(x, mode) {
+ convert_array_stream(x)
+}
+
#' @importFrom utils str
#' @export
str.nanoarrow_array_stream <- function(object, ...) {
diff --git a/r/R/array.R b/r/R/array.R
index 9df5c79..3f74c2e 100644
--- a/r/R/array.R
+++ b/r/R/array.R
@@ -46,7 +46,7 @@ as_nanoarrow_array <- function(x, ..., schema = NULL) {
#' @export
as.vector.nanoarrow_array <- function(x, mode = "any") {
stopifnot(identical(mode, "any"))
- from_nanoarrow_array(x, to = infer_nanoarrow_ptype(x))
+ convert_array(x, to = infer_nanoarrow_ptype(x))
}
#' @export
@@ -61,7 +61,7 @@ as.data.frame.nanoarrow_array <- function(x, ...) {
)
}
- .Call(nanoarrow_c_from_array, x, NULL)
+ .Call(nanoarrow_c_convert_array, x, NULL)
}
# exported in zzz.R
diff --git a/r/R/convert-array-stream.R b/r/R/convert-array-stream.R
new file mode 100644
index 0000000..eaeda7d
--- /dev/null
+++ b/r/R/convert-array-stream.R
@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Convert an Array Stream into an R vector
+#'
+#' Converts `array_stream` to the type specified by `to`. This is a low-level
+#' interface; most users should use `as.data.frame()` or `as.vector()` unless
+#' finer-grained control is needed over the conversion. See [convert_array()]
+#' for details of the conversion process; see [infer_nanoarrow_ptype()] for
+#' default inferences of `to`.
+#'
+#' @param array_stream A [nanoarrow_array_stream][as_nanoarrow_array_stream].
+#' @param size The exact size of the output, if known. If specified,
+#' slightly more efficient implementation may be used to collect the output.
+#' @param n The maximum number of batches to pull from the array stream.
+#' @inheritParams convert_array
+#'
+#' @return An R vector of type `to`.
+#' @export
+#'
+#' @examples
+#' stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))
+#' str(convert_array_stream(stream))
+#' str(convert_array_stream(stream, to = data.frame(x = double())))
+#'
+convert_array_stream <- function(array_stream, to = NULL, size = NULL, n = Inf) {
+ stopifnot(
+ inherits(array_stream, "nanoarrow_array_stream")
+ )
+
+ schema <- .Call(nanoarrow_c_array_stream_get_schema, array_stream)
+ if (is.null(to)) {
+ to <- infer_nanoarrow_ptype(schema)
+ } else if (is.function(to)) {
+ to <- to(schema, infer_nanoarrow_ptype(schema))
+ }
+
+ n <- as.double(n)[1]
+ if (!is.null(size)) {
+ return(
+ .Call(
+ nanoarrow_c_convert_array_stream,
+ array_stream,
+ to,
+ as.double(size)[1],
+ n
+ )
+ )
+ }
+
+ batches <- vector("list", 1024L)
+ n_batches <- 0L
+ get_next <- array_stream$get_next
+ while (!is.null(array <- get_next(schema, validate = FALSE)) && (n_batches < n)) {
+ n_batches <- n_batches + 1L
+ batches[[n_batches]] <- .Call(nanoarrow_c_convert_array, array, to)
+ }
+
+ if (n_batches == 0L && is.data.frame(to)) {
+ to[integer(0), , drop = FALSE]
+ } else if (n_batches == 0L && is.data.frame(to)) {
+ to[integer(0)]
+ } else if (n_batches == 1L) {
+ batches[[1]]
+ } else if (inherits(to, "data.frame")) {
+ do.call(rbind, batches[seq_len(n_batches)])
+ } else {
+ do.call(c, batches[seq_len(n_batches)])
+ }
+}
diff --git a/r/R/convert-array.R b/r/R/convert-array.R
new file mode 100644
index 0000000..1160ab6
--- /dev/null
+++ b/r/R/convert-array.R
@@ -0,0 +1,153 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+#' Convert an Array into an R vector
+#'
+#' Converts `array` to the type specified by `to`. This is a low-level interface;
+#' most users should use `as.data.frame()` or `as.vector()` unless finer-grained
+#' control is needed over the conversion. This function is an S3 generic
+#' dispatching on `to`: developers may implement their own S3 methods for
+#' custom vector types.
+#'
+#' @param array A [nanoarrow_array][as_nanoarrow_array].
+#' @param to A target prototype object describing the type to which `array`
+#' should be converted, or `NULL` to use the default conversion as
+#' returned by [infer_nanoarrow_ptype()]. Alternatively, a function can be
+#' passed to perform an alternative calculation of the default ptype as
+#' a function of `array` and the default inference of the prototype.
+#' @param ... Passed to S3 methods
+#'
+#' @return An R vector of type `to`.
+#' @export
+#'
+#' @details
+#' Conversions are implemented for the following R vector types:
+#'
+#' - [logical()]: Any numeric type can be converted to [logical()] in addition
+#' to the bool type. For numeric types, any non-zero value is considered `TRUE`.
+#' - [integer()]: Any numeric type can be converted to [integer()]; however,
+#' a warning will be signaled if the any value is outside the range of the
+#' 32-bit integer.
+#' - [double()]: Any numeric type can be converted to [double()]. This
+#' conversion currently does not warn for values that may not roundtrip
+#' through a floating-point double (e.g., very large uint64 and int64 values).
+#' - [character()]: String and large string types can be converted to
+#' [character()]. The conversion does not check for valid UTF-8: if you need
+#' finer-grained control over encodings, use `to = blob::blob()`.
+#' - [Date][as.Date]: Only the date32 type can be converted to an R Date vector.
+#' - [hms::hms()]: Time32 and time64 types can be converted to [hms::hms()].
+#' - [difftime()]: Time32, time64, and duration types can be converted to
+#' R [difftime()] vectors. The value is converted to match the [units()]
+#' attribute of `to`.
+#' - [blob::blob()]: String, large string, binary, and large binary types can
+#' be converted to [blob::blob()].
+#' - [vctrs::list_of()]: List, large list, and fixed-size list types can be
+#' converted to [vctrs::list_of()].
+#' - [data.frame()]: Struct types can be converted to [data.frame()].
+#' - [vctrs::unspecified()]: Any type can be converted to [vctrs::unspecified()];
+#' however, a warning will be raised if any non-null values are encountered.
+#'
+#' In addition to the above conversions, a null array may be converted to any
+#' target prototype except [data.frame()]. Extension arrays are currently
+#' converted as their storage type; dictionary-encoded arrays are not
+#' currently supported.
+#'
+#' @examples
+#' array <- as_nanoarrow_array(data.frame(x = 1:5))
+#' str(convert_array(array))
+#' str(convert_array(array, to = data.frame(x = double())))
+#'
+convert_array <- function(array, to = NULL, ...) {
+ stopifnot(inherits(array, "nanoarrow_array"))
+ UseMethod("convert_array", to)
+}
+
+#' @export
+convert_array.default <- function(array, to = NULL, ..., .from_c = FALSE) {
+ if (.from_c) {
+ stop_cant_convert_array(array, to)
+ }
+
+ if (is.function(to)) {
+ to <- to(array, infer_nanoarrow_ptype(array))
+ }
+
+ .Call(nanoarrow_c_convert_array, array, to)
+}
+
+# This is defined because it's verbose to pass named arguments from C.
+# When converting data frame columns, we try the internal C conversions
+# first to save R evaluation overhead. When the internal conversions fail,
+# we call convert_array() to dispatch to conversions defined via S3
+# dispatch, making sure to let the default method know that we've already
+# tried the internal C conversions.
+convert_array_from_c <- function(array, to) {
+ convert_array(array, to, .from_c = TRUE)
+}
+
+#' @export
+convert_array.vctrs_partial_frame <- function(array, to, ...) {
+ ptype <- infer_nanoarrow_ptype(array)
+ if (!is.data.frame(ptype)) {
+ stop_cant_convert_array(array, to)
+ }
+
+ ptype <- vctrs::vec_ptype_common(ptype, to)
+ .Call(nanoarrow_c_convert_array, array, ptype)
+}
+
+stop_cant_convert_array <- function(array, to, n = 0) {
+ stop_cant_convert_schema(infer_nanoarrow_schema(array), to, n - 1)
+}
+
+stop_cant_convert_schema <- function(schema, to, n = 0) {
+ schema_label <- nanoarrow_schema_formatted(schema)
+
+ if (is.null(schema$name) || identical(schema$name, "")) {
+ cnd <- simpleError(
+ sprintf(
+ "Can't convert array <%s> to R vector of type %s",
+ schema_label,
+ class(to)[1]
+ ),
+ call = sys.call(n - 1)
+ )
+ } else {
+ cnd <- simpleError(
+ sprintf(
+ "Can't convert `%s` <%s> to R vector of type %s",
+ schema$name,
+ schema_label,
+ class(to)[1]
+ ),
+ call = sys.call(n - 1)
+ )
+ }
+
+ stop(cnd)
+}
+
+# Called from C for decimal types
+convert_decimal_to_double <- function(array, schema, offset, length) {
+ array2 <- nanoarrow_allocate_array()
+ schema2 <- nanoarrow_allocate_schema()
+ nanoarrow_pointer_export(array, array2)
+ nanoarrow_pointer_export(schema, schema2)
+ arrow_array <- arrow::Array$import_from_c(array2, schema2)
+ arrow_array$Slice(offset, length)$as_vector()
+}
diff --git a/r/R/infer-ptype.R b/r/R/infer-ptype.R
new file mode 100644
index 0000000..ab2f750
--- /dev/null
+++ b/r/R/infer-ptype.R
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Infer an R vector prototype
+#'
+#' Resolves the default `to` value to use in [convert_array()] and
+#' [convert_array_stream()]. The default conversions are:
+#'
+#' - null to [vctrs::unspecified()]
+#' - boolean to [logical()]
+#' - int8, uint8, int16, uint16, and int13 to [integer()]
+#' - uint32, int64, uint64, float, and double to [double()]
+#' - string and large string to [character()]
+#' - struct to [data.frame()]
+#' - binary and large binary to [blob::blob()]
+#' - list, large_list, and fixed_size_list to [vctrs::list_of()]
+#' - time32 and time64 to [hms::hms()]
+#' - duration to [difftime()]
+#' - date32 to [as.Date()]
+#' - timestamp to [as.POSIXct()]
+#'
+#' Additional conversions are possible by specifying an explicit value for
+#' `to`. For details of each conversion, see [convert_array()].
+#'
+#' @param x A [nanoarrow_schema][as_nanoarrow_schema],
+#' [nanoarrow_array][as_nanoarrow_array], or
+#' [nanoarrow_array_stream][as_nanoarrow_array_stream].
+#'
+#' @return An R vector of zero size describing the target into which
+#' the array should be materialized.
+#' @export
+#'
+#' @examples
+#' infer_nanoarrow_ptype(as_nanoarrow_array(1:10))
+#'
+infer_nanoarrow_ptype <- function(x) {
+ if (inherits(x, "nanoarrow_array")) {
+ x <- .Call(nanoarrow_c_infer_schema_array, x)
+ } else if (inherits(x, "nanoarrow_array_stream")) {
+ x <- .Call(nanoarrow_c_array_stream_get_schema, x)
+ } else if (!inherits(x, "nanoarrow_schema")) {
+ stop("`x` must be a nanoarrow_schema(), nanoarrow_array(), or nanoarrow_array_stream()")
+ }
+
+ .Call(nanoarrow_c_infer_ptype, x)
+}
+
+# This is called from C from nanoarrow_c_infer_ptype when all the C conversions
+# have been tried. Some of these inferences could be moved to C to be faster
+# (but are much less verbose to create here)
+infer_ptype_other <- function(schema) {
+ # we don't need the user-friendly versions and this is performance-sensitive
+ parsed <- .Call(nanoarrow_c_schema_parse, schema)
+
+ switch(
+ parsed$type,
+ "na" = vctrs::unspecified(),
+ "binary" = ,
+ "large_binary" = blob::new_blob(),
+ "date32" = structure(numeric(), class = "Date"),
+ "time32" = ,
+ "time64" = hms::hms(),
+ "duration" = structure(numeric(), class = "difftime", units = "secs"),
+ "date64" = ,
+ "timestamp" = {
+ if (is.null(parsed$timezone) || parsed$timezone == "") {
+ # We almost never want to assume the user's timezone here, which is
+ # what would happen if we passed on "". This is consistent with how
+ # readr handles reading timezones (assign "UTC" since it's DST-free
+ # and let the user explicitly set this later)
+ parsed$timezone <- getOption("nanoarrow.timezone_if_unspecified", "UTC")
+ }
+
+ structure(
+ numeric(0),
+ class = c("POSIXct", "POSIXt"),
+ tzone = parsed$timezone
+ )
+ },
+ "large_list" = ,
+ "list" = ,
+ "fixed_size_list" = {
+ ptype <- infer_nanoarrow_ptype(schema$children[[1]])
+ vctrs::list_of(.ptype = ptype)
+ },
+ stop_cant_infer_ptype(schema, n = -1)
+ )
+}
+
+stop_cant_infer_ptype <- function(schema, n = 0) {
+ schema_label <- nanoarrow_schema_formatted(schema)
+
+ if (is.null(schema$name) || identical(schema$name, "")) {
+ cnd <- simpleError(
+ sprintf(
+ "Can't infer R vector type for array <%s>",
+ schema_label
+ ),
+ call = sys.call(n - 1)
+ )
+ } else {
+ cnd <- simpleError(
+ sprintf(
+ "Can't infer R vector type for `%s` <%s>",
+ schema$name,
+ schema_label
+ ),
+ call = sys.call(n - 1)
+ )
+ }
+
+ stop(cnd)
+}
diff --git a/r/R/schema.R b/r/R/schema.R
index 0f5424e..87cc934 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -24,6 +24,8 @@
#' are represented identically.
#'
#' @param x An object to convert to a schema
+#' @param recursive Use `TRUE` to include a `children` member when parsing
+#' schemas.
#' @param ... Passed to S3 methods
#'
#' @return An object of class 'nanoarrow_schema'
@@ -53,6 +55,20 @@ infer_nanoarrow_schema.default <- function(x, ...) {
as_nanoarrow_schema(arrow::infer_type(x, ...))
}
+#' @rdname as_nanoarrow_schema
+#' @export
+nanoarrow_schema_parse <- function(x, recursive = FALSE) {
+ parsed <- .Call(nanoarrow_c_schema_parse, as_nanoarrow_schema(x))
+ parsed_null <- vapply(parsed, is.null, logical(1))
+ result <- parsed[!parsed_null]
+
+ if (recursive && !is.null(x$children)) {
+ result$children <- lapply(x$children, nanoarrow_schema_parse, TRUE)
+ }
+
+ result
+}
+
#' @importFrom utils str
#' @export
str.nanoarrow_schema <- function(object, ...) {
diff --git a/r/man/as_nanoarrow_schema.Rd b/r/man/as_nanoarrow_schema.Rd
index 9cc25b0..ece1873 100644
--- a/r/man/as_nanoarrow_schema.Rd
+++ b/r/man/as_nanoarrow_schema.Rd
@@ -3,16 +3,22 @@
\name{as_nanoarrow_schema}
\alias{as_nanoarrow_schema}
\alias{infer_nanoarrow_schema}
+\alias{nanoarrow_schema_parse}
\title{Convert an object to a nanoarrow schema}
\usage{
as_nanoarrow_schema(x, ...)
infer_nanoarrow_schema(x, ...)
+
+nanoarrow_schema_parse(x, recursive = FALSE)
}
\arguments{
\item{x}{An object to convert to a schema}
\item{...}{Passed to S3 methods}
+
+\item{recursive}{Use \code{TRUE} to include a \code{children} member when parsing
+schemas.}
}
\value{
An object of class 'nanoarrow_schema'
diff --git a/r/man/convert_array.Rd b/r/man/convert_array.Rd
new file mode 100644
index 0000000..bb45ee9
--- /dev/null
+++ b/r/man/convert_array.Rd
@@ -0,0 +1,68 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/convert-array.R
+\name{convert_array}
+\alias{convert_array}
+\title{Convert an Array into an R vector}
+\usage{
+convert_array(array, to = NULL, ...)
+}
+\arguments{
+\item{array}{A \link[=as_nanoarrow_array]{nanoarrow_array}.}
+
+\item{to}{A target prototype object describing the type to which \code{array}
+should be converted, or \code{NULL} to use the default conversion as
+returned by \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}}. Alternatively, a function can be
+passed to perform an alternative calculation of the default ptype as
+a function of \code{array} and the default inference of the prototype.}
+
+\item{...}{Passed to S3 methods}
+}
+\value{
+An R vector of type \code{to}.
+}
+\description{
+Converts \code{array} to the type specified by \code{to}. This is a low-level interface;
+most users should use \code{as.data.frame()} or \code{as.vector()} unless finer-grained
+control is needed over the conversion. This function is an S3 generic
+dispatching on \code{to}: developers may implement their own S3 methods for
+custom vector types.
+}
+\details{
+Conversions are implemented for the following R vector types:
+\itemize{
+\item \code{\link[=logical]{logical()}}: Any numeric type can be converted to \code{\link[=logical]{logical()}} in addition
+to the bool type. For numeric types, any non-zero value is considered \code{TRUE}.
+\item \code{\link[=integer]{integer()}}: Any numeric type can be converted to \code{\link[=integer]{integer()}}; however,
+a warning will be signaled if the any value is outside the range of the
+32-bit integer.
+\item \code{\link[=double]{double()}}: Any numeric type can be converted to \code{\link[=double]{double()}}. This
+conversion currently does not warn for values that may not roundtrip
+through a floating-point double (e.g., very large uint64 and int64 values).
+\item \code{\link[=character]{character()}}: String and large string types can be converted to
+\code{\link[=character]{character()}}. The conversion does not check for valid UTF-8: if you need
+finer-grained control over encodings, use \code{to = blob::blob()}.
+\item \link[=as.Date]{Date}: Only the date32 type can be converted to an R Date vector.
+\item \code{\link[hms:hms]{hms::hms()}}: Time32 and time64 types can be converted to \code{\link[hms:hms]{hms::hms()}}.
+\item \code{\link[=difftime]{difftime()}}: Time32, time64, and duration types can be converted to
+R \code{\link[=difftime]{difftime()}} vectors. The value is converted to match the \code{\link[=units]{units()}}
+attribute of \code{to}.
+\item \code{\link[blob:blob]{blob::blob()}}: String, large string, binary, and large binary types can
+be converted to \code{\link[blob:blob]{blob::blob()}}.
+\item \code{\link[vctrs:list_of]{vctrs::list_of()}}: List, large list, and fixed-size list types can be
+converted to \code{\link[vctrs:list_of]{vctrs::list_of()}}.
+\item \code{\link[=data.frame]{data.frame()}}: Struct types can be converted to \code{\link[=data.frame]{data.frame()}}.
+\item \code{\link[vctrs:unspecified]{vctrs::unspecified()}}: Any type can be converted to \code{\link[vctrs:unspecified]{vctrs::unspecified()}};
+however, a warning will be raised if any non-null values are encountered.
+}
+
+In addition to the above conversions, a null array may be converted to any
+target prototype except \code{\link[=data.frame]{data.frame()}}. Extension arrays are currently
+converted as their storage type; dictionary-encoded arrays are not
+currently supported.
+}
+\examples{
+array <- as_nanoarrow_array(data.frame(x = 1:5))
+str(convert_array(array))
+str(convert_array(array, to = data.frame(x = double())))
+
+}
diff --git a/r/man/convert_array_stream.Rd b/r/man/convert_array_stream.Rd
new file mode 100644
index 0000000..97e3608
--- /dev/null
+++ b/r/man/convert_array_stream.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/convert-array-stream.R
+\name{convert_array_stream}
+\alias{convert_array_stream}
+\title{Convert an Array Stream into an R vector}
+\usage{
+convert_array_stream(array_stream, to = NULL, size = NULL, n = Inf)
+}
+\arguments{
+\item{array_stream}{A \link[=as_nanoarrow_array_stream]{nanoarrow_array_stream}.}
+
+\item{to}{A target prototype object describing the type to which \code{array}
+should be converted, or \code{NULL} to use the default conversion as
+returned by \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}}. Alternatively, a function can be
+passed to perform an alternative calculation of the default ptype as
+a function of \code{array} and the default inference of the prototype.}
+
+\item{size}{The exact size of the output, if known. If specified,
+slightly more efficient implementation may be used to collect the output.}
+
+\item{n}{The maximum number of batches to pull from the array stream.}
+}
+\value{
+An R vector of type \code{to}.
+}
+\description{
+Converts \code{array_stream} to the type specified by \code{to}. This is a low-level
+interface; most users should use \code{as.data.frame()} or \code{as.vector()} unless
+finer-grained control is needed over the conversion. See \code{\link[=convert_array]{convert_array()}}
+for details of the conversion process; see \code{\link[=infer_nanoarrow_ptype]{infer_nanoarrow_ptype()}} for
+default inferences of \code{to}.
+}
+\examples{
+stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))
+str(convert_array_stream(stream))
+str(convert_array_stream(stream, to = data.frame(x = double())))
+
+}
diff --git a/r/man/from_nanoarrow_array.Rd b/r/man/from_nanoarrow_array.Rd
deleted file mode 100644
index e4f8d70..0000000
--- a/r/man/from_nanoarrow_array.Rd
+++ /dev/null
@@ -1,25 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/array-convert-vector.R
-\name{from_nanoarrow_array}
-\alias{from_nanoarrow_array}
-\alias{infer_nanoarrow_ptype}
-\title{Convert an Array to an R vector}
-\usage{
-from_nanoarrow_array(array, to = NULL, ...)
-
-infer_nanoarrow_ptype(array)
-}
-\arguments{
-\item{array}{A \link[=as_nanoarrow_array]{nanoarrow_array}.}
-
-\item{to}{A target prototype object describing the type to which \code{array}
-should be converted, or \code{NULL} to use the default conversion.}
-
-\item{...}{Passed to S3 methods}
-}
-\value{
-An R vector of type \code{to}.
-}
-\description{
-Convert an Array to an R vector
-}
diff --git a/r/man/infer_nanoarrow_ptype.Rd b/r/man/infer_nanoarrow_ptype.Rd
new file mode 100644
index 0000000..7c444c7
--- /dev/null
+++ b/r/man/infer_nanoarrow_ptype.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/infer-ptype.R
+\name{infer_nanoarrow_ptype}
+\alias{infer_nanoarrow_ptype}
+\title{Infer an R vector prototype}
+\usage{
+infer_nanoarrow_ptype(x)
+}
+\arguments{
+\item{x}{A \link[=as_nanoarrow_schema]{nanoarrow_schema},
+\link[=as_nanoarrow_array]{nanoarrow_array}, or
+\link[=as_nanoarrow_array_stream]{nanoarrow_array_stream}.}
+}
+\value{
+An R vector of zero size describing the target into which
+the array should be materialized.
+}
+\description{
+Resolves the default \code{to} value to use in \code{\link[=convert_array]{convert_array()}} and
+\code{\link[=convert_array_stream]{convert_array_stream()}}. The default conversions are:
+}
+\details{
+\itemize{
+\item null to \code{\link[vctrs:unspecified]{vctrs::unspecified()}}
+\item boolean to \code{\link[=logical]{logical()}}
+\item int8, uint8, int16, uint16, and int13 to \code{\link[=integer]{integer()}}
+\item uint32, int64, uint64, float, and double to \code{\link[=double]{double()}}
+\item string and large string to \code{\link[=character]{character()}}
+\item struct to \code{\link[=data.frame]{data.frame()}}
+\item binary and large binary to \code{\link[blob:blob]{blob::blob()}}
+\item list, large_list, and fixed_size_list to \code{\link[vctrs:list_of]{vctrs::list_of()}}
+\item time32 and time64 to \code{\link[hms:hms]{hms::hms()}}
+\item duration to \code{\link[=difftime]{difftime()}}
+\item date32 to \code{\link[=as.Date]{as.Date()}}
+\item timestamp to \code{\link[=as.POSIXct]{as.POSIXct()}}
+}
+
+Additional conversions are possible by specifying an explicit value for
+\code{to}. For details of each conversion, see \code{\link[=convert_array]{convert_array()}}.
+}
+\examples{
+infer_nanoarrow_ptype(as_nanoarrow_array(1:10))
+
+}
diff --git a/r/src/altrep.c b/r/src/altrep.c
index dbb9e39..9a85e2b 100644
--- a/r/src/altrep.c
+++ b/r/src/altrep.c
@@ -24,9 +24,9 @@
#include "altrep.h"
#include "array.h"
+#include "convert.h"
#include "nanoarrow.h"
-
-#include "materialize.h"
+#include "util.h"
#ifdef HAS_ALTREP
@@ -36,27 +36,26 @@
//
// All ALTREP classes follow some common patterns:
//
-// - R_altrep_data1() holds an external pointer to a struct ArrowArrayView
+// - R_altrep_data1() holds an external pointer to a struct RConverter.
// - R_altrep_data2() holds the materialized version of the vector.
// - When materialization happens, we set R_altrep_data1() to R_NilValue
// to ensure we don't hold on to any more resources than needed.
static R_xlen_t nanoarrow_altrep_length(SEXP altrep_sexp) {
- SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
- if (array_view_xptr == R_NilValue) {
+ SEXP converter_xptr = R_altrep_data1(altrep_sexp);
+ if (converter_xptr == R_NilValue) {
return Rf_xlength(R_altrep_data2(altrep_sexp));
}
- struct ArrowArrayView* array_view =
- (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
- return array_view->array->length;
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ return converter->array_view.array->length;
}
static Rboolean nanoarrow_altrep_inspect(SEXP altrep_sexp, int pre, int deep, int pvec,
void (*inspect_subtree)(SEXP, int, int, int)) {
- SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
+ SEXP converter_xptr = R_altrep_data1(altrep_sexp);
const char* materialized = "";
- if (array_view_xptr == R_NilValue) {
+ if (converter_xptr == R_NilValue) {
materialized = "materialized ";
}
@@ -67,36 +66,39 @@ static Rboolean nanoarrow_altrep_inspect(SEXP altrep_sexp, int pre, int deep, in
}
static SEXP nanoarrow_altstring_elt(SEXP altrep_sexp, R_xlen_t i) {
- SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
- if (array_view_xptr == R_NilValue) {
+ SEXP converter_xptr = R_altrep_data1(altrep_sexp);
+ if (converter_xptr == R_NilValue) {
return STRING_ELT(R_altrep_data2(altrep_sexp), i);
}
- struct ArrowArrayView* array_view =
- (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
-
- if (ArrowArrayViewIsNull(array_view, i)) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ if (ArrowArrayViewIsNull(&converter->array_view, i)) {
return NA_STRING;
}
- struct ArrowStringView item = ArrowArrayViewGetStringUnsafe(array_view, i);
+ struct ArrowStringView item = ArrowArrayViewGetStringUnsafe(&converter->array_view, i);
return Rf_mkCharLenCE(item.data, item.n_bytes, CE_UTF8);
}
static SEXP nanoarrow_altstring_materialize(SEXP altrep_sexp) {
- SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
- if (array_view_xptr == R_NilValue) {
+ SEXP converter_xptr = R_altrep_data1(altrep_sexp);
+ if (converter_xptr == R_NilValue) {
return R_altrep_data2(altrep_sexp);
}
- struct ArrowArrayView* array_view =
- (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+ if (nanoarrow_converter_materialize_all(converter_xptr) != NANOARROW_OK) {
+ Rf_error("Error materializing altstring");
+ }
+
+ if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) {
+ Rf_error("Error finalizing materialized altstring");
+ }
- SEXP result = PROTECT(nanoarrow_materialize_chr(array_view));
- R_set_altrep_data2(altrep_sexp, result);
+ SEXP result_sexp = PROTECT(nanoarrow_converter_release_result(converter_xptr));
+ R_set_altrep_data2(altrep_sexp, result_sexp);
R_set_altrep_data1(altrep_sexp, R_NilValue);
UNPROTECT(1);
- return result;
+ return result_sexp;
}
static void* nanoarrow_altrep_dataptr(SEXP altrep_sexp, Rboolean writable) {
@@ -104,8 +106,8 @@ static void* nanoarrow_altrep_dataptr(SEXP altrep_sexp, Rboolean writable) {
}
static const void* nanoarrow_altrep_dataptr_or_null(SEXP altrep_sexp) {
- SEXP array_view_xptr = R_altrep_data1(altrep_sexp);
- if (array_view_xptr == R_NilValue) {
+ SEXP converter_xptr = R_altrep_data1(altrep_sexp);
+ if (converter_xptr == R_NilValue) {
return DATAPTR_OR_NULL(R_altrep_data2(altrep_sexp));
}
@@ -139,37 +141,48 @@ static void register_nanoarrow_altstring(DllInfo* info) {
// - It may be beneficial to implement the Extract_subset method to defer string
// conversion even longer since this is expensive compared to rearranging integer
// indices.
+ // - The duplicate method may be useful because it's used when setting attributes
+ // or unclassing the vector.
#endif
}
void register_nanoarrow_altrep(DllInfo* info) { register_nanoarrow_altstring(info); }
-SEXP nanoarrow_c_make_altrep_chr(SEXP array_view_xptr) {
+SEXP nanoarrow_c_make_altrep_chr(SEXP array_xptr) {
#ifdef HAS_ALTREP
- struct ArrowArrayView* array_view =
- (struct ArrowArrayView*)R_ExternalPtrAddr(array_view_xptr);
+ SEXP schema_xptr = array_xptr_get_schema(array_xptr);
+
+ // Create the converter
+ SEXP converter_xptr = PROTECT(nanoarrow_converter_from_type(VECTOR_TYPE_CHR));
+ if (nanoarrow_converter_set_schema(converter_xptr, schema_xptr) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
- switch (array_view->storage_type) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ switch (converter->array_view.storage_type) {
+ case NANOARROW_TYPE_NA:
case NANOARROW_TYPE_STRING:
case NANOARROW_TYPE_LARGE_STRING:
break;
default:
+ UNPROTECT(1);
return R_NilValue;
}
// Ensure the array that we're attaching to this ALTREP object does not keep its
// parent struct alive unnecessarily (i.e., a user can select only a few columns
// and the memory for the unused columns will be released).
- SEXP array_xptr_independent =
- PROTECT(array_xptr_ensure_independent(R_ExternalPtrProtected(array_view_xptr)));
- array_view->array = array_from_xptr(array_xptr_independent);
- R_SetExternalPtrProtected(array_view_xptr, array_xptr_independent);
- UNPROTECT(1);
+ SEXP array_xptr_independent = PROTECT(array_xptr_ensure_independent(array_xptr));
- Rf_setAttrib(array_view_xptr, R_ClassSymbol, Rf_mkString("nanoarrow::altrep_chr"));
- SEXP out = PROTECT(R_new_altrep(nanoarrow_altrep_chr_cls, array_view_xptr, R_NilValue));
+ if (nanoarrow_converter_set_array(converter_xptr, array_xptr_independent) !=
+ NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ Rf_setAttrib(converter_xptr, R_ClassSymbol, nanoarrow_cls_altrep_chr);
+ SEXP out = PROTECT(R_new_altrep(nanoarrow_altrep_chr_cls, converter_xptr, R_NilValue));
MARK_NOT_MUTABLE(out);
- UNPROTECT(1);
+ UNPROTECT(3);
return out;
#else
return R_NilValue;
diff --git a/r/src/altrep.h b/r/src/altrep.h
index ef01b21..98be7e2 100644
--- a/r/src/altrep.h
+++ b/r/src/altrep.h
@@ -56,6 +56,6 @@ static inline int is_nanoarrow_altrep(SEXP x) {
// Creates an altstring vector backed by a nanoarrow array or returns
// R_NilValue if the conversion is not possible.
-SEXP nanoarrow_c_make_altrep_chr(SEXP array_view_xptr);
+SEXP nanoarrow_c_make_altrep_chr(SEXP array_xptr);
#endif
diff --git a/r/src/array.c b/r/src/array.c
index b98b331..6fa034b 100644
--- a/r/src/array.c
+++ b/r/src/array.c
@@ -22,6 +22,7 @@
#include "array.h"
#include "nanoarrow.h"
#include "schema.h"
+#include "util.h"
void finalize_array_xptr(SEXP array_xptr) {
struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr);
@@ -77,7 +78,7 @@ SEXP nanoarrow_c_infer_schema_array(SEXP array_xptr) {
static SEXP borrow_array_xptr(struct ArrowArray* array, SEXP shelter) {
SEXP array_xptr = PROTECT(R_MakeExternalPtr(array, R_NilValue, shelter));
- Rf_setAttrib(array_xptr, R_ClassSymbol, Rf_mkString("nanoarrow_array"));
+ Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
UNPROTECT(1);
return array_xptr;
}
@@ -86,7 +87,9 @@ SEXP borrow_array_child_xptr(SEXP array_xptr, int64_t i) {
struct ArrowArray* array = array_from_xptr(array_xptr);
SEXP schema_xptr = R_ExternalPtrTag(array_xptr);
SEXP child_xptr = PROTECT(borrow_array_xptr(array->children[i], array_xptr));
- array_xptr_set_schema(child_xptr, borrow_schema_child_xptr(schema_xptr, i));
+ if (schema_xptr != R_NilValue) {
+ array_xptr_set_schema(child_xptr, borrow_schema_child_xptr(schema_xptr, i));
+ }
UNPROTECT(1);
return child_xptr;
}
diff --git a/r/src/array.h b/r/src/array.h
index f974450..4167001 100644
--- a/r/src/array.h
+++ b/r/src/array.h
@@ -22,6 +22,7 @@
#include <Rinternals.h>
#include "nanoarrow.h"
+#include "util.h"
void finalize_array_xptr(SEXP array_xptr);
void finalize_exported_array(struct ArrowArray* array);
@@ -70,7 +71,7 @@ static inline SEXP array_owning_xptr() {
array->release = NULL;
SEXP array_xptr = PROTECT(R_MakeExternalPtr(array, R_NilValue, R_NilValue));
- Rf_setAttrib(array_xptr, R_ClassSymbol, Rf_mkString("nanoarrow_array"));
+ Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
R_RegisterCFinalizer(array_xptr, &finalize_array_xptr);
UNPROTECT(1);
return array_xptr;
@@ -82,6 +83,10 @@ static inline void array_xptr_set_schema(SEXP array_xptr, SEXP schema_xptr) {
R_SetExternalPtrTag(array_xptr, schema_xptr);
}
+static inline SEXP array_xptr_get_schema(SEXP array_xptr) {
+ return R_ExternalPtrTag(array_xptr);
+}
+
// Retrieves a schema from an array external pointer if it exists or returns
// NULL otherwise.
static inline struct ArrowSchema* schema_from_array_xptr(SEXP array_xptr) {
@@ -113,7 +118,7 @@ static inline void array_export(SEXP array_xptr, struct ArrowArray* array_copy)
array_copy->private_data = independent_array_xptr;
array_copy->release = &finalize_exported_array;
R_PreserveObject(independent_array_xptr);
- UNPROTECT(1);
+ UNPROTECT(1);
}
// When arrays arrive as a nanoarrow_array, they are responsible for
diff --git a/r/src/array_convert_vector.c b/r/src/array_convert_vector.c
deleted file mode 100644
index e4b30cc..0000000
--- a/r/src/array_convert_vector.c
+++ /dev/null
@@ -1,359 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#define R_NO_REMAP
-#include <R.h>
-#include <Rinternals.h>
-
-#include "nanoarrow.h"
-
-#include "altrep.h"
-#include "array.h"
-#include "array_view.h"
-#include "materialize.h"
-
-// These are the vector types that have some special casing
-// internally to avoid unnecessary allocations or looping at
-// the R level. Other types are represented by an SEXP ptype.
-enum VectorType {
- VECTOR_TYPE_LGL,
- VECTOR_TYPE_INT,
- VECTOR_TYPE_DBL,
- VECTOR_TYPE_CHR,
- VECTOR_TYPE_LIST_OF_RAW,
- VECTOR_TYPE_DATA_FRAME,
- VECTOR_TYPE_OTHER
-};
-
-// These conversions are the default R-native type guesses for
-// an array that don't require extra information from the ptype (e.g.,
-// factor with levels). Some of these guesses may result in a conversion
-// that later warns for out-of-range values (e.g., int64 to double());
-// however, a user can use the from_nanoarrow_array(x, ptype = something_safer())
-// when this occurs.
-static enum VectorType vector_type_from_array_type(enum ArrowType type) {
- switch (type) {
- case NANOARROW_TYPE_BOOL:
- return VECTOR_TYPE_LGL;
-
- case NANOARROW_TYPE_INT8:
- case NANOARROW_TYPE_UINT8:
- case NANOARROW_TYPE_INT16:
- case NANOARROW_TYPE_UINT16:
- case NANOARROW_TYPE_INT32:
- return VECTOR_TYPE_INT;
-
- case NANOARROW_TYPE_UINT32:
- case NANOARROW_TYPE_INT64:
- case NANOARROW_TYPE_UINT64:
- case NANOARROW_TYPE_FLOAT:
- case NANOARROW_TYPE_DOUBLE:
- return VECTOR_TYPE_DBL;
-
- case NANOARROW_TYPE_STRING:
- case NANOARROW_TYPE_LARGE_STRING:
- return VECTOR_TYPE_CHR;
-
- case NANOARROW_TYPE_BINARY:
- case NANOARROW_TYPE_LARGE_BINARY:
- return VECTOR_TYPE_LIST_OF_RAW;
-
- case NANOARROW_TYPE_STRUCT:
- return VECTOR_TYPE_DATA_FRAME;
-
- default:
- return VECTOR_TYPE_OTHER;
- }
-}
-
-// The same as the above, but from a nanoarrow_array()
-static enum VectorType vector_type_from_array_xptr(SEXP array_xptr) {
- struct ArrowSchema* schema = schema_from_array_xptr(array_xptr);
-
- struct ArrowSchemaView schema_view;
- struct ArrowError error;
- if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) {
- Rf_error("vector_type_from_array_view_xptr(): %s", ArrowErrorMessage(&error));
- }
-
- return vector_type_from_array_type(schema_view.data_type);
-}
-
-// Call stop_cant_infer_ptype(), which gives a more informative error
-// message than we can provide in a reasonable amount of C code here
-static void call_stop_cant_infer_ptype(SEXP array_xptr) {
- SEXP ns = PROTECT(R_FindNamespace(Rf_mkString("nanoarrow")));
- SEXP call = PROTECT(Rf_lang2(Rf_install("stop_cant_infer_ptype"), array_xptr));
- Rf_eval(call, ns);
- UNPROTECT(2);
-}
-
-SEXP nanoarrow_c_infer_ptype(SEXP array_xptr);
-
-static SEXP infer_ptype_data_frame(SEXP array_xptr) {
- struct ArrowArray* array = array_from_xptr(array_xptr);
- SEXP result = PROTECT(Rf_allocVector(VECSXP, array->n_children));
- SEXP result_names = PROTECT(Rf_allocVector(STRSXP, array->n_children));
-
- for (R_xlen_t i = 0; i < array->n_children; i++) {
- SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
- SET_VECTOR_ELT(result, i, nanoarrow_c_infer_ptype(child_xptr));
- UNPROTECT(1);
-
- struct ArrowSchema* schema = schema_from_array_xptr(child_xptr);
- if (schema->name != NULL) {
- SET_STRING_ELT(result_names, i, Rf_mkCharCE(schema->name, CE_UTF8));
- } else {
- SET_STRING_ELT(result_names, i, Rf_mkChar(""));
- }
- }
-
- Rf_setAttrib(result, R_ClassSymbol, Rf_mkString("data.frame"));
- Rf_setAttrib(result, R_NamesSymbol, result_names);
- SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2));
- INTEGER(rownames)[0] = NA_INTEGER;
- INTEGER(rownames)[1] = 0;
- Rf_setAttrib(result, R_RowNamesSymbol, rownames);
- UNPROTECT(3);
- return result;
-}
-
-SEXP nanoarrow_c_infer_ptype(SEXP array_xptr) {
- enum VectorType vector_type = vector_type_from_array_xptr(array_xptr);
-
- switch (vector_type) {
- case VECTOR_TYPE_LGL:
- return Rf_allocVector(LGLSXP, 0);
- case VECTOR_TYPE_INT:
- return Rf_allocVector(INTSXP, 0);
- case VECTOR_TYPE_DBL:
- return Rf_allocVector(REALSXP, 0);
- case VECTOR_TYPE_CHR:
- return Rf_allocVector(STRSXP, 0);
- case VECTOR_TYPE_DATA_FRAME:
- return infer_ptype_data_frame(array_xptr);
- default:
- call_stop_cant_infer_ptype(array_xptr);
- }
-
- return R_NilValue;
-}
-
-// This calls from_nanoarrow_array() (via a package helper) to try S3
-// dispatch to find a from_nanoarrow_array() method (or error if there
-// isn't one)
-static SEXP call_from_nanoarrow_array(SEXP array_xptr, SEXP ptype_sexp) {
- SEXP ns = PROTECT(R_FindNamespace(Rf_mkString("nanoarrow")));
- SEXP call = PROTECT(
- Rf_lang3(Rf_install("from_nanoarrow_array_from_c"), array_xptr, ptype_sexp));
- SEXP result = PROTECT(Rf_eval(call, ns));
- UNPROTECT(3);
- return result;
-}
-
-// Call stop_cant_convert_array(), which gives a more informative error
-// message than we can provide in a reasonable amount of C code here
-static void call_stop_cant_convert_array(SEXP array_xptr, int sexp_type) {
- SEXP ns = PROTECT(R_FindNamespace(Rf_mkString("nanoarrow")));
- SEXP ptype_sexp = PROTECT(Rf_allocVector(sexp_type, 0));
- SEXP call =
- PROTECT(Rf_lang3(Rf_install("stop_cant_convert_array"), array_xptr, ptype_sexp));
- Rf_eval(call, ns);
- UNPROTECT(3);
-}
-
-SEXP nanoarrow_c_from_array(SEXP array_xptr, SEXP ptype_sexp);
-
-static SEXP from_array_to_data_frame(SEXP array_xptr, SEXP ptype_sexp) {
- struct ArrowArray* array = array_from_xptr(array_xptr);
- R_xlen_t n_col = array->n_children;
- SEXP result = PROTECT(Rf_allocVector(VECSXP, n_col));
-
- if (ptype_sexp == R_NilValue) {
- SEXP result_names = PROTECT(Rf_allocVector(STRSXP, n_col));
-
- for (R_xlen_t i = 0; i < n_col; i++) {
- SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
- SET_VECTOR_ELT(result, i, nanoarrow_c_from_array(child_xptr, R_NilValue));
- UNPROTECT(1);
-
- struct ArrowSchema* schema = schema_from_array_xptr(child_xptr);
- if (schema->name != NULL) {
- SET_STRING_ELT(result_names, i, Rf_mkCharCE(schema->name, CE_UTF8));
- } else {
- SET_STRING_ELT(result_names, i, Rf_mkChar(""));
- }
- }
-
- Rf_setAttrib(result, R_NamesSymbol, result_names);
- UNPROTECT(1);
- } else {
- if (n_col != Rf_xlength(ptype_sexp)) {
- Rf_error("Expected data.frame() ptype with %ld column(s) but found %ld column(s)",
- (long)n_col, (long)Rf_xlength(ptype_sexp));
- }
-
- for (R_xlen_t i = 0; i < n_col; i++) {
- SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
- SEXP child_ptype = VECTOR_ELT(ptype_sexp, i);
- SET_VECTOR_ELT(result, i, nanoarrow_c_from_array(child_xptr, child_ptype));
- UNPROTECT(1);
- }
-
- Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype_sexp, R_NamesSymbol));
- }
-
- Rf_setAttrib(result, R_ClassSymbol, Rf_mkString("data.frame"));
- SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2));
- INTEGER(rownames)[0] = NA_INTEGER;
- INTEGER(rownames)[1] = array->length;
- Rf_setAttrib(result, R_RowNamesSymbol, rownames);
-
- UNPROTECT(2);
- return result;
-}
-
-static SEXP from_array_to_lgl(SEXP array_xptr) {
- SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
- SEXP result = PROTECT(nanoarrow_materialize_lgl(array_view_from_xptr(array_view_xptr)));
- if (result == R_NilValue) {
- call_stop_cant_convert_array(array_xptr, LGLSXP);
- }
- UNPROTECT(2);
- return result;
-}
-
-static SEXP from_array_to_int(SEXP array_xptr) {
- SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
- SEXP result = PROTECT(nanoarrow_materialize_int(array_view_from_xptr(array_view_xptr)));
- if (result == R_NilValue) {
- call_stop_cant_convert_array(array_xptr, INTSXP);
- }
- UNPROTECT(2);
- return result;
-}
-
-static SEXP from_array_to_dbl(SEXP array_xptr) {
- SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
- SEXP result = PROTECT(nanoarrow_materialize_dbl(array_view_from_xptr(array_view_xptr)));
- if (result == R_NilValue) {
- call_stop_cant_convert_array(array_xptr, REALSXP);
- }
- UNPROTECT(2);
- return result;
-}
-
-static SEXP from_array_to_chr(SEXP array_xptr) {
- SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
- SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_view_xptr));
- if (result == R_NilValue) {
- call_stop_cant_convert_array(array_xptr, STRSXP);
- }
- UNPROTECT(2);
- return result;
-}
-
-static SEXP from_array_to_list_of_raw(SEXP array_xptr) {
- SEXP array_view_xptr = PROTECT(array_view_xptr_from_array_xptr(array_xptr));
- SEXP result =
- PROTECT(nanoarrow_materialize_list_of_raw(array_view_from_xptr(array_view_xptr)));
- if (result == R_NilValue) {
- call_stop_cant_convert_array(array_xptr, STRSXP);
- }
- UNPROTECT(2);
- return result;
-}
-
-// TODO: Lists are not all that well supported yet.
-static SEXP from_array_to_list(SEXP array_xptr, SEXP ptype_sexp) {
- struct ArrowSchema* schema = schema_from_array_xptr(array_xptr);
-
- struct ArrowSchemaView schema_view;
- struct ArrowError error;
- if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) {
- Rf_error("from_array_to_list(): %s", ArrowErrorMessage(&error));
- }
-
- SEXP result = R_NilValue;
- switch (schema_view.data_type) {
- case NANOARROW_TYPE_BINARY:
- case NANOARROW_TYPE_LARGE_BINARY:
- result = PROTECT(from_array_to_list_of_raw(array_xptr));
- break;
- default:
- call_stop_cant_convert_array(array_xptr, STRSXP);
- }
-
- UNPROTECT(1);
- return result;
-}
-
-SEXP nanoarrow_c_from_array(SEXP array_xptr, SEXP ptype_sexp) {
- // See if we can skip any ptype resolution at all
- if (ptype_sexp == R_NilValue) {
- enum VectorType vector_type = vector_type_from_array_xptr(array_xptr);
- switch (vector_type) {
- case VECTOR_TYPE_LGL:
- return from_array_to_lgl(array_xptr);
- case VECTOR_TYPE_INT:
- return from_array_to_int(array_xptr);
- case VECTOR_TYPE_DBL:
- return from_array_to_dbl(array_xptr);
- case VECTOR_TYPE_CHR:
- return from_array_to_chr(array_xptr);
- case VECTOR_TYPE_LIST_OF_RAW:
- return from_array_to_list_of_raw(array_xptr);
- case VECTOR_TYPE_DATA_FRAME:
- return from_array_to_data_frame(array_xptr, R_NilValue);
- default:
- break;
- }
-
- // Otherwise, resolve the ptype and use it (this will also error
- // for ptypes that can't be resolved)
- ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(array_xptr));
- SEXP result = nanoarrow_c_from_array(array_xptr, ptype_sexp);
- UNPROTECT(1);
- return result;
- }
-
- // Handle some S3 objects internally to avoid S3 dispatch
- // (e.g., when looping over a data frame with a lot of columns)
- if (Rf_isObject(ptype_sexp)) {
- if (Rf_inherits(ptype_sexp, "data.frame") && !Rf_inherits(ptype_sexp, "tbl_df")) {
- return from_array_to_data_frame(array_xptr, ptype_sexp);
- } else {
- return call_from_nanoarrow_array(array_xptr, ptype_sexp);
- }
- }
-
- // If we're here, these are non-S3 objects
- switch (TYPEOF(ptype_sexp)) {
- case LGLSXP:
- return from_array_to_lgl(array_xptr);
- case INTSXP:
- return from_array_to_int(array_xptr);
- case REALSXP:
- return from_array_to_dbl(array_xptr);
- case STRSXP:
- return from_array_to_chr(array_xptr);
- case VECSXP:
- return from_array_to_list(array_xptr, ptype_sexp);
- default:
- return call_from_nanoarrow_array(array_xptr, ptype_sexp);
- }
-}
diff --git a/r/src/array_stream.h b/r/src/array_stream.h
index d8bf71a..816944d 100644
--- a/r/src/array_stream.h
+++ b/r/src/array_stream.h
@@ -22,6 +22,7 @@
#include <Rinternals.h>
#include "nanoarrow.h"
+#include "util.h"
void finalize_array_stream_xptr(SEXP array_stream_xptr);
@@ -54,7 +55,7 @@ static inline SEXP array_stream_owning_xptr() {
SEXP array_stream_xptr =
PROTECT(R_MakeExternalPtr(array_stream, R_NilValue, R_NilValue));
- Rf_setAttrib(array_stream_xptr, R_ClassSymbol, Rf_mkString("nanoarrow_array_stream"));
+ Rf_setAttrib(array_stream_xptr, R_ClassSymbol, nanoarrow_cls_array_stream);
R_RegisterCFinalizer(array_stream_xptr, &finalize_array_stream_xptr);
UNPROTECT(1);
return array_stream_xptr;
diff --git a/r/src/array_view.c b/r/src/array_view.c
index 2d536aa..f34d97d 100644
--- a/r/src/array_view.c
+++ b/r/src/array_view.c
@@ -23,6 +23,7 @@
#include "array.h"
#include "schema.h"
+#include "util.h"
static void finalize_array_view_xptr(SEXP array_view_xptr) {
struct ArrowArrayView* array_view =
@@ -56,7 +57,7 @@ SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr) {
Rf_error("<ArrowArrayViewSetArray> %s", error.message);
}
- Rf_setAttrib(xptr, R_ClassSymbol, Rf_mkString("nanoarrow_array_view"));
+ Rf_setAttrib(xptr, R_ClassSymbol, nanoarrow_cls_array_view);
UNPROTECT(1);
return xptr;
}
diff --git a/r/src/convert.c b/r/src/convert.c
new file mode 100644
index 0000000..be262ee
--- /dev/null
+++ b/r/src/convert.c
@@ -0,0 +1,491 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+#include "array.h"
+#include "convert.h"
+#include "materialize.h"
+#include "schema.h"
+
+static R_xlen_t nanoarrow_vec_size(SEXP vec_sexp, struct PTypeView* ptype_view) {
+ if (ptype_view->vector_type == VECTOR_TYPE_DATA_FRAME) {
+ if (Rf_length(vec_sexp) > 0) {
+ // This both avoids materializing the row.names attribute and
+ // makes this work with struct-style vctrs that don't have a
+ // row.names attribute but that always have one or more element
+ return Rf_xlength(VECTOR_ELT(vec_sexp, 0));
+ } else {
+ // Since ALTREP was introduced, materializing the row.names attribute is
+ // usually deferred such that values in the form c(NA, -nrow), 1:nrow, or
+ // as.character(1:nrow) are never actually computed when the length is
+ // taken.
+ return Rf_xlength(Rf_getAttrib(vec_sexp, R_RowNamesSymbol));
+ }
+ } else {
+ return Rf_xlength(vec_sexp);
+ }
+}
+
+static void finalize_converter(SEXP converter_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ if (converter != NULL) {
+ ArrowArrayViewReset(&converter->array_view);
+
+ if (converter->children != NULL) {
+ ArrowFree(converter->children);
+ }
+
+ ArrowFree(converter);
+ }
+}
+
+SEXP nanoarrow_converter_from_type(enum VectorType vector_type) {
+ struct RConverter* converter =
+ (struct RConverter*)ArrowMalloc(sizeof(struct RConverter));
+ if (converter == NULL) {
+ Rf_error("Failed to allocate RConverter");
+ }
+
+ // 0: ptype, 1: schema_xptr, 2: array_xptr, 3: children, 4: result
+ SEXP converter_shelter = PROTECT(Rf_allocVector(VECSXP, 5));
+ SEXP converter_xptr =
+ PROTECT(R_MakeExternalPtr(converter, R_NilValue, converter_shelter));
+ R_RegisterCFinalizer(converter_xptr, &finalize_converter);
+
+ ArrowArrayViewInit(&converter->array_view, NANOARROW_TYPE_UNINITIALIZED);
+ converter->schema_view.data_type = NANOARROW_TYPE_UNINITIALIZED;
+ converter->schema_view.storage_data_type = NANOARROW_TYPE_UNINITIALIZED;
+ converter->src.array_view = &converter->array_view;
+ converter->dst.vec_sexp = R_NilValue;
+ converter->options = NULL;
+ converter->error.message[0] = '\0';
+ converter->size = 0;
+ converter->capacity = 0;
+ converter->n_children = 0;
+ converter->children = NULL;
+
+ converter->ptype_view.vector_type = vector_type;
+ converter->ptype_view.ptype = R_NilValue;
+
+ switch (vector_type) {
+ case VECTOR_TYPE_NULL:
+ converter->ptype_view.sexp_type = NILSXP;
+ break;
+ case VECTOR_TYPE_LGL:
+ converter->ptype_view.sexp_type = LGLSXP;
+ break;
+ case VECTOR_TYPE_INT:
+ converter->ptype_view.sexp_type = INTSXP;
+ break;
+ case VECTOR_TYPE_DBL:
+ converter->ptype_view.sexp_type = REALSXP;
+ break;
+ case VECTOR_TYPE_CHR:
+ converter->ptype_view.sexp_type = STRSXP;
+ break;
+ default:
+ UNPROTECT(2);
+ return R_NilValue;
+ }
+
+ UNPROTECT(2);
+ return converter_xptr;
+}
+
+static enum RTimeUnits time_units_from_difftime(SEXP ptype) {
+ SEXP units_attr = Rf_getAttrib(ptype, Rf_install("units"));
+ if (units_attr == R_NilValue || TYPEOF(units_attr) != STRSXP ||
+ Rf_length(units_attr) != 1) {
+ Rf_error("Expected difftime 'units' attribute of type character(1)");
+ }
+
+ const char* dst_units = Rf_translateCharUTF8(STRING_ELT(units_attr, 0));
+ if (strcmp(dst_units, "secs") == 0) {
+ return R_TIME_UNIT_SECONDS;
+ } else if (strcmp(dst_units, "mins") == 0) {
+ return R_TIME_UNIT_MINUTES;
+ } else if (strcmp(dst_units, "hours") == 0) {
+ return R_TIME_UNIT_HOURS;
+ } else if (strcmp(dst_units, "days") == 0) {
+ return R_TIME_UNIT_DAYS;
+ } else if (strcmp(dst_units, "weeks") == 0) {
+ return R_TIME_UNIT_WEEKS;
+ } else {
+ Rf_error("Unexpected value for difftime 'units' attribute");
+ return R_TIME_UNIT_SECONDS;
+ }
+}
+
+static void set_converter_data_frame(SEXP converter_xptr, struct RConverter* converter,
+ SEXP ptype) {
+ converter->n_children = Rf_xlength(ptype);
+ converter->children = (struct RConverter**)ArrowMalloc(converter->n_children *
+ sizeof(struct RConverter*));
+ if (converter->children == NULL) {
+ Rf_error("Failed to allocate converter children array");
+ }
+
+ SEXP child_converter_xptrs = PROTECT(Rf_allocVector(VECSXP, converter->n_children));
+
+ for (R_xlen_t i = 0; i < converter->n_children; i++) {
+ SEXP child_ptype = VECTOR_ELT(ptype, i);
+ SEXP child_converter = PROTECT(nanoarrow_converter_from_ptype(child_ptype));
+ converter->children[i] = (struct RConverter*)R_ExternalPtrAddr(child_converter);
+ SET_VECTOR_ELT(child_converter_xptrs, i, child_converter);
+ UNPROTECT(1);
+ }
+
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SET_VECTOR_ELT(converter_shelter, 3, child_converter_xptrs);
+ UNPROTECT(1);
+}
+
+static void set_converter_list_of(SEXP converter_xptr, struct RConverter* converter,
+ SEXP ptype) {
+ SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype"));
+ if (child_ptype == R_NilValue) {
+ Rf_error("Expected attribute 'ptype' for conversion to list_of");
+ }
+
+ converter->children = (struct RConverter**)ArrowMalloc(1 * sizeof(struct RConverter*));
+ if (converter->children == NULL) {
+ Rf_error("Failed to allocate converter children array");
+ }
+ converter->n_children = 1;
+
+ SEXP child_converter_xptrs = PROTECT(Rf_allocVector(VECSXP, 1));
+ SEXP child_converter = PROTECT(nanoarrow_converter_from_ptype(child_ptype));
+ converter->children[0] = (struct RConverter*)R_ExternalPtrAddr(child_converter);
+ SET_VECTOR_ELT(child_converter_xptrs, 0, child_converter);
+
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SET_VECTOR_ELT(converter_shelter, 3, child_converter_xptrs);
+ UNPROTECT(2);
+}
+
+static int set_converter_children_schema(SEXP converter_xptr, SEXP schema_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+
+ if (schema->n_children != converter->n_children) {
+ ArrowErrorSet(&converter->error,
+ "Expected schema with %ld children but got schema with %ld children",
+ (long)converter->n_children, (long)schema->n_children);
+ return EINVAL;
+ }
+
+ SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
+
+ for (R_xlen_t i = 0; i < converter->n_children; i++) {
+ SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i);
+ SEXP child_schema_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, i));
+ int result = nanoarrow_converter_set_schema(child_converter_xptr, child_schema_xptr);
+ UNPROTECT(1);
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+ }
+
+ return NANOARROW_OK;
+}
+
+static int set_converter_children_array(SEXP converter_xptr, SEXP array_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+
+ if (array->n_children != converter->n_children) {
+ ArrowErrorSet(&converter->error,
+ "Expected array with %ld children but got array with %ld children",
+ (long)converter->n_children, (long)array->n_children);
+ return EINVAL;
+ }
+
+ SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
+
+ for (R_xlen_t i = 0; i < converter->n_children; i++) {
+ SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i);
+ SEXP child_array_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
+ int result = nanoarrow_converter_set_array(child_converter_xptr, child_array_xptr);
+ UNPROTECT(1);
+ if (result != NANOARROW_OK) {
+ return result;
+ }
+ }
+
+ return NANOARROW_OK;
+}
+
+SEXP nanoarrow_converter_from_ptype(SEXP ptype) {
+ SEXP converter_xptr = PROTECT(nanoarrow_converter_from_type(VECTOR_TYPE_NULL));
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+
+ if (Rf_isObject(ptype)) {
+ if (nanoarrow_ptype_is_data_frame(ptype)) {
+ converter->ptype_view.vector_type = VECTOR_TYPE_DATA_FRAME;
+ set_converter_data_frame(converter_xptr, converter, ptype);
+ } else if (Rf_inherits(ptype, "blob")) {
+ converter->ptype_view.vector_type = VECTOR_TYPE_BLOB;
+ } else if (Rf_inherits(ptype, "vctrs_list_of")) {
+ converter->ptype_view.vector_type = VECTOR_TYPE_LIST_OF;
+ set_converter_list_of(converter_xptr, converter, ptype);
+ } else if (Rf_inherits(ptype, "vctrs_unspecified")) {
+ converter->ptype_view.vector_type = VECTOR_TYPE_UNSPECIFIED;
+ } else if (Rf_inherits(ptype, "Date")) {
+ converter->ptype_view.vector_type = VECTOR_TYPE_DATE;
+ converter->ptype_view.r_time_units = R_TIME_UNIT_DAYS;
+ } else if (Rf_inherits(ptype, "POSIXct")) {
+ converter->ptype_view.vector_type = VECTOR_TYPE_POSIXCT;
+ converter->ptype_view.r_time_units = R_TIME_UNIT_SECONDS;
+ } else if (Rf_inherits(ptype, "difftime")) {
+ converter->ptype_view.vector_type = VECTOR_TYPE_DIFFTIME;
+ converter->ptype_view.r_time_units = time_units_from_difftime(ptype);
+ } else {
+ converter->ptype_view.vector_type = VECTOR_TYPE_OTHER;
+ }
+ } else {
+ switch (TYPEOF(ptype)) {
+ case LGLSXP:
+ converter->ptype_view.vector_type = VECTOR_TYPE_LGL;
+ break;
+ case INTSXP:
+ converter->ptype_view.vector_type = VECTOR_TYPE_INT;
+ break;
+ case REALSXP:
+ converter->ptype_view.vector_type = VECTOR_TYPE_DBL;
+ break;
+ case STRSXP:
+ converter->ptype_view.vector_type = VECTOR_TYPE_CHR;
+ break;
+ default:
+ converter->ptype_view.vector_type = VECTOR_TYPE_OTHER;
+ break;
+ }
+ }
+
+ converter->ptype_view.ptype = ptype;
+ converter->ptype_view.sexp_type = TYPEOF(ptype);
+ SET_VECTOR_ELT(converter_shelter, 0, ptype);
+
+ UNPROTECT(1);
+ return converter_xptr;
+}
+
+int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaViewInit(&converter->schema_view, schema, &converter->error));
+
+ // TODO: Currently we error at the materialize stage if a conversion is not possible;
+ // however, at this stage we have all the information we need to calculate that.
+
+ // For extension types, warn that we are about to strip the extension type, as we don't
+ // have a mechanism for dealing with them yet
+ if (converter->schema_view.extension_name.n_bytes > 0) {
+ int64_t schema_chars = ArrowSchemaToString(schema, NULL, 0, 1);
+ SEXP fmt_shelter = PROTECT(Rf_allocVector(RAWSXP, schema_chars + 1));
+ ArrowSchemaToString(schema, (char*)RAW(fmt_shelter), schema_chars + 1, 1);
+ const char* schema_name = schema->name;
+ if (schema_name == NULL || schema_name[0] == '\0') {
+ Rf_warning("Converting unknown extension %s as storage type",
+ (const char*)RAW(fmt_shelter));
+ } else {
+ Rf_warning("%s: Converting unknown extension %s as storage type", schema_name,
+ (const char*)RAW(fmt_shelter));
+ }
+
+ UNPROTECT(1);
+ }
+
+ // Sub-par error for dictionary types until we have a way to deal with them
+ if (converter->schema_view.data_type == NANOARROW_TYPE_DICTIONARY) {
+ ArrowErrorSet(&converter->error,
+ "Conversion to dictionary-encoded array is not supported");
+ return ENOTSUP;
+ }
+
+ SET_VECTOR_ELT(converter_shelter, 1, schema_xptr);
+
+ ArrowArrayViewReset(&converter->array_view);
+ SET_VECTOR_ELT(converter_shelter, 2, R_NilValue);
+ NANOARROW_RETURN_NOT_OK(
+ ArrowArrayViewInitFromSchema(&converter->array_view, schema, &converter->error));
+
+ if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
+ converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
+ set_converter_children_schema(converter_xptr, schema_xptr);
+ }
+
+ return NANOARROW_OK;
+}
+
+int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+ NANOARROW_RETURN_NOT_OK(
+ ArrowArrayViewSetArray(&converter->array_view, array, &converter->error));
+ SET_VECTOR_ELT(converter_shelter, 2, array_xptr);
+ converter->src.offset = 0;
+ converter->src.length = 0;
+
+ if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
+ converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
+ set_converter_children_array(converter_xptr, array_xptr);
+ }
+
+ return NANOARROW_OK;
+}
+
+void sync_after_converter_reallocate(SEXP converter_xptr, struct RConverter* converter,
+ SEXP result_sexp, R_xlen_t capacity) {
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SET_VECTOR_ELT(converter_shelter, 4, result_sexp);
+
+ converter->dst.vec_sexp = result_sexp;
+ converter->dst.offset = 0;
+ converter->dst.length = 0;
+ converter->size = 0;
+ converter->capacity = capacity;
+
+ if (converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
+ SEXP child_converters = VECTOR_ELT(converter_shelter, 3);
+ for (R_xlen_t i = 0; i < converter->n_children; i++) {
+ sync_after_converter_reallocate(VECTOR_ELT(child_converters, i),
+ converter->children[i], VECTOR_ELT(result_sexp, i),
+ capacity);
+ }
+ }
+}
+
+int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SEXP current_result = VECTOR_ELT(converter_shelter, 4);
+
+ if (current_result != R_NilValue) {
+ ArrowErrorSet(&converter->error, "Reallocation in converter is not implemented");
+ return ENOTSUP;
+ }
+
+ SEXP result_sexp;
+ if (converter->ptype_view.ptype != R_NilValue) {
+ result_sexp = PROTECT(
+ nanoarrow_materialize_realloc(converter->ptype_view.ptype, additional_size));
+ } else {
+ result_sexp =
+ PROTECT(nanoarrow_alloc_type(converter->ptype_view.vector_type, additional_size));
+ }
+
+ sync_after_converter_reallocate(converter_xptr, converter, result_sexp,
+ additional_size);
+ UNPROTECT(1);
+ return NANOARROW_OK;
+}
+
+R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ if ((converter->dst.offset + n) > converter->capacity) {
+ n = converter->capacity - converter->dst.offset;
+ }
+
+ if ((converter->src.offset + n) > converter->array_view.array->length) {
+ n = converter->array_view.array->length - converter->src.offset;
+ }
+
+ if (n == 0) {
+ return 0;
+ }
+
+ converter->src.length = converter->dst.length = n;
+ int result = nanoarrow_materialize(converter, converter_xptr);
+ if (result != NANOARROW_OK) {
+ ArrowErrorSet(&converter->error, "Error in nanoarrow_materialize()");
+ return 0;
+ }
+
+ converter->src.offset += n;
+ converter->dst.offset += n;
+ converter->size += n;
+ return n;
+}
+
+int nanoarrow_converter_materialize_all(SEXP converter_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ R_xlen_t remaining = converter->array_view.array->length;
+ NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, remaining));
+ if (nanoarrow_converter_materialize_n(converter_xptr, remaining) != remaining) {
+ return ERANGE;
+ } else {
+ return NANOARROW_OK;
+ }
+}
+
+int nanoarrow_converter_finalize(SEXP converter_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SEXP current_result = VECTOR_ELT(converter_shelter, 4);
+
+ // Materialize never called (e.g., empty stream)
+ if (current_result == R_NilValue) {
+ NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0));
+ current_result = VECTOR_ELT(converter_shelter, 4);
+ }
+
+ // Check result size. A future implementation could also shrink the length
+ // or reallocate a shorter vector.
+ R_xlen_t current_result_size =
+ nanoarrow_vec_size(current_result, &converter->ptype_view);
+ if (current_result_size != converter->size) {
+ ArrowErrorSet(&converter->error,
+ "Expected result of size %ld but got result of size %ld",
+ (long)current_result_size, (long)converter->size);
+ return ENOTSUP;
+ }
+
+ return NANOARROW_OK;
+}
+
+SEXP nanoarrow_converter_release_result(SEXP converter_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ // PROTECT()ing here because we are about to release the object from the
+ // shelter of the converter and return it
+ SEXP result = PROTECT(VECTOR_ELT(converter_shelter, 4));
+ SET_VECTOR_ELT(converter_shelter, 4, R_NilValue);
+ converter->dst.vec_sexp = R_NilValue;
+ converter->dst.offset = 0;
+ converter->dst.length = 0;
+ converter->size = 0;
+ converter->capacity = 0;
+ UNPROTECT(1);
+ return result;
+}
+
+void nanoarrow_converter_stop(SEXP converter_xptr) {
+ struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
+ Rf_error("%s", ArrowErrorMessage(&converter->error));
+}
diff --git a/r/src/convert.h b/r/src/convert.h
new file mode 100644
index 0000000..0397447
--- /dev/null
+++ b/r/src/convert.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_CONVERT_H_INCLUDED
+#define R_CONVERT_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+#include "materialize.h"
+
+// Create and initialize a converter. A converter's output R vector type
+// never changes once it has been created.
+SEXP nanoarrow_converter_from_type(enum VectorType vector_type);
+SEXP nanoarrow_converter_from_ptype(SEXP ptype);
+
+// Set the schema for the next array that will be materialized into
+// the R vector. In theory this could change although this has not been
+// implemented. This will also validate the schema. Returns an errno code.
+int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr);
+
+// Set the array target. This will also validate the array against the last
+// schema that was set. Returns an errno code.
+int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr);
+
+// Reserve space in the R vector output for additional elements. In theory
+// this could be used to provide growable behaviour; however, this is not
+// implemented. Returns an errno code.
+int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size);
+
+// Materialize the next n elements into the output. Returns the number of elements
+// that were actualy materialized which may be less than n.
+R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n);
+
+// Materialize the entire array into the output. Returns an errno code.
+int nanoarrow_converter_materialize_all(SEXP converter_xptr);
+
+// Finalize the output. Currently this just validates the length of the
+// output. Returns an errno code.
+int nanoarrow_converter_finalize(SEXP converter_xptr);
+
+// Returns the resulting SEXP and moves the result out of the protection
+// of the converter.
+SEXP nanoarrow_converter_release_result(SEXP converter_xptr);
+
+// Calls Rf_error() with the internal error buffer populated by above calls
+// that return a non-zero errno value.
+void nanoarrow_converter_stop(SEXP converter_xptr);
+
+#endif
diff --git a/r/src/convert_array.c b/r/src/convert_array.c
new file mode 100644
index 0000000..5508e6f
--- /dev/null
+++ b/r/src/convert_array.c
@@ -0,0 +1,217 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+#include "altrep.h"
+#include "array.h"
+#include "array_view.h"
+#include "convert.h"
+#include "util.h"
+
+// The common case of converting a single array into a single vector is
+// defined here, powered by the generic conversion available via
+// convert.h but special-casing the common case of "just use the defaults"
+// (i.e., no need to allocate a zero-size ptype) and returning ALTREP
+// where possible.
+
+// This calls nanoarrow::convert_array() (via a package helper) to try S3
+// dispatch to find a convert_array() method (or error if there
+// isn't one)
+static SEXP call_convert_array(SEXP array_xptr, SEXP ptype_sexp) {
+ SEXP fun = PROTECT(Rf_install("convert_array_from_c"));
+ SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp));
+ SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
+ UNPROTECT(3);
+ return result;
+}
+
+// Call stop_cant_convert_array(), which gives a more informative error
+// message than we can provide in a reasonable amount of C code here.
+// Because we opportunistically avoid allocating a ptype object, we might
+// have to allocate one here.
+static void call_stop_cant_convert_array(SEXP array_xptr, enum VectorType type,
+ SEXP ptype_sexp) {
+ SEXP fun = PROTECT(Rf_install("stop_cant_convert_array"));
+
+ if (ptype_sexp == R_NilValue) {
+ ptype_sexp = PROTECT(nanoarrow_alloc_type(type, 0));
+ SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp));
+ Rf_eval(call, nanoarrow_ns_pkg);
+ UNPROTECT(3);
+ } else {
+ SEXP call = PROTECT(Rf_lang3(fun, array_xptr, ptype_sexp));
+ Rf_eval(call, nanoarrow_ns_pkg);
+ UNPROTECT(2);
+ }
+}
+
+static SEXP convert_array_default(SEXP array_xptr, enum VectorType vector_type,
+ SEXP ptype) {
+ SEXP converter_xptr;
+ if (ptype == R_NilValue) {
+ converter_xptr = PROTECT(nanoarrow_converter_from_type(vector_type));
+ } else {
+ converter_xptr = PROTECT(nanoarrow_converter_from_ptype(ptype));
+ }
+
+ if (nanoarrow_converter_set_schema(converter_xptr, array_xptr_get_schema(array_xptr)) !=
+ NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ if (nanoarrow_converter_materialize_all(converter_xptr) != NANOARROW_OK) {
+ call_stop_cant_convert_array(array_xptr, vector_type, ptype);
+ }
+
+ if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ SEXP result = PROTECT(nanoarrow_converter_release_result(converter_xptr));
+ UNPROTECT(2);
+ return result;
+}
+
+static SEXP convert_array_chr(SEXP array_xptr) {
+ SEXP result = PROTECT(nanoarrow_c_make_altrep_chr(array_xptr));
+ if (result == R_NilValue) {
+ call_stop_cant_convert_array(array_xptr, VECTOR_TYPE_CHR, R_NilValue);
+ }
+ UNPROTECT(1);
+ return result;
+}
+
+SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp);
+
+static SEXP convert_array_data_frame(SEXP array_xptr, SEXP ptype_sexp) {
+ struct ArrowArray* array = array_from_xptr(array_xptr);
+ R_xlen_t n_col = array->n_children;
+ SEXP result = PROTECT(Rf_allocVector(VECSXP, n_col));
+
+ if (ptype_sexp == R_NilValue) {
+ SEXP result_names = PROTECT(Rf_allocVector(STRSXP, n_col));
+
+ for (R_xlen_t i = 0; i < n_col; i++) {
+ SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
+ SET_VECTOR_ELT(result, i, nanoarrow_c_convert_array(child_xptr, R_NilValue));
+ UNPROTECT(1);
+
+ struct ArrowSchema* schema = schema_from_array_xptr(child_xptr);
+ if (schema->name != NULL) {
+ SET_STRING_ELT(result_names, i, Rf_mkCharCE(schema->name, CE_UTF8));
+ } else {
+ SET_STRING_ELT(result_names, i, Rf_mkChar(""));
+ }
+ }
+
+ Rf_setAttrib(result, R_NamesSymbol, result_names);
+ Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame);
+ UNPROTECT(1);
+ } else {
+ if (n_col != Rf_xlength(ptype_sexp)) {
+ Rf_error("Expected data.frame() ptype with %ld column(s) but found %ld column(s)",
+ (long)n_col, (long)Rf_xlength(ptype_sexp));
+ }
+
+ for (R_xlen_t i = 0; i < n_col; i++) {
+ SEXP child_xptr = PROTECT(borrow_array_child_xptr(array_xptr, i));
+ SEXP child_ptype = VECTOR_ELT(ptype_sexp, i);
+ SET_VECTOR_ELT(result, i, nanoarrow_c_convert_array(child_xptr, child_ptype));
+ UNPROTECT(1);
+ }
+
+ Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype_sexp, R_NamesSymbol));
+ Rf_copyMostAttrib(ptype_sexp, result);
+ }
+
+ if (Rf_inherits(result, "data.frame")) {
+ nanoarrow_set_rownames(result, array->length);
+ }
+
+ UNPROTECT(1);
+ return result;
+}
+
+// borrow nanoarrow_c_infer_ptype() from infer_ptype.c
+SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);
+enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr);
+
+SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp) {
+ // See if we can skip any ptype resolution at all
+ if (ptype_sexp == R_NilValue) {
+ enum VectorType vector_type = nanoarrow_infer_vector_type_array(array_xptr);
+ switch (vector_type) {
+ case VECTOR_TYPE_LGL:
+ case VECTOR_TYPE_INT:
+ case VECTOR_TYPE_DBL:
+ return convert_array_default(array_xptr, vector_type, R_NilValue);
+ case VECTOR_TYPE_CHR:
+ return convert_array_chr(array_xptr);
+ case VECTOR_TYPE_DATA_FRAME:
+ return convert_array_data_frame(array_xptr, R_NilValue);
+ default:
+ break;
+ }
+
+ // Otherwise, resolve the ptype and use it (this will also error
+ // for ptypes that can't be resolved)
+ ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(array_xptr_get_schema(array_xptr)));
+ SEXP result = nanoarrow_c_convert_array(array_xptr, ptype_sexp);
+ UNPROTECT(1);
+ return result;
+ }
+
+ // Handle some S3 objects internally to avoid S3 dispatch
+ // (e.g., when looping over a data frame with a lot of columns)
+ if (Rf_isObject(ptype_sexp)) {
+ if (nanoarrow_ptype_is_data_frame(ptype_sexp)) {
+ return convert_array_data_frame(array_xptr, ptype_sexp);
+ } else if (Rf_inherits(ptype_sexp, "vctrs_unspecified") ||
+ Rf_inherits(ptype_sexp, "blob") ||
+ Rf_inherits(ptype_sexp, "vctrs_list_of") ||
+ Rf_inherits(ptype_sexp, "Date") || Rf_inherits(ptype_sexp, "hms") ||
+ Rf_inherits(ptype_sexp, "POSIXct") ||
+ Rf_inherits(ptype_sexp, "difftime")) {
+ return convert_array_default(array_xptr, VECTOR_TYPE_OTHER, ptype_sexp);
+ } else {
+ return call_convert_array(array_xptr, ptype_sexp);
+ }
+ }
+
+ // If we're here, these are non-S3 objects
+ switch (TYPEOF(ptype_sexp)) {
+ case LGLSXP:
+ return convert_array_default(array_xptr, VECTOR_TYPE_LGL, ptype_sexp);
+ case INTSXP:
+ return convert_array_default(array_xptr, VECTOR_TYPE_INT, ptype_sexp);
+ case REALSXP:
+ return convert_array_default(array_xptr, VECTOR_TYPE_DBL, ptype_sexp);
+ case STRSXP:
+ return convert_array_chr(array_xptr);
+ default:
+ return call_convert_array(array_xptr, ptype_sexp);
+ }
+}
diff --git a/r/src/convert_array_stream.c b/r/src/convert_array_stream.c
new file mode 100644
index 0000000..7bb4dc9
--- /dev/null
+++ b/r/src/convert_array_stream.c
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+#include "array.h"
+#include "array_stream.h"
+#include "convert.h"
+#include "schema.h"
+
+SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp,
+ SEXP size_sexp, SEXP n_sexp) {
+ struct ArrowArrayStream* array_stream = array_stream_from_xptr(array_stream_xptr);
+ double size = REAL(size_sexp)[0];
+ double n = REAL(n_sexp)[0];
+
+ SEXP schema_xptr = PROTECT(schema_owning_xptr());
+ struct ArrowSchema* schema = (struct ArrowSchema*)R_ExternalPtrAddr(schema_xptr);
+ int result = array_stream->get_schema(array_stream, schema);
+ if (result != NANOARROW_OK) {
+ Rf_error("ArrowArrayStream::get_schema(): %s",
+ array_stream->get_last_error(array_stream));
+ }
+
+ SEXP converter_xptr = PROTECT(nanoarrow_converter_from_ptype(ptype_sexp));
+ if (nanoarrow_converter_set_schema(converter_xptr, schema_xptr) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ if (nanoarrow_converter_reserve(converter_xptr, size) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ SEXP array_xptr = PROTECT(array_owning_xptr());
+ struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr);
+
+ int64_t n_batches = 0;
+ int64_t n_materialized = 0;
+ if (n > 0) {
+ result = array_stream->get_next(array_stream, array);
+ n_batches++;
+ if (result != NANOARROW_OK) {
+ Rf_error("ArrowArrayStream::get_next(): %s",
+ array_stream->get_last_error(array_stream));
+ }
+
+ while (array->release != NULL) {
+ if (nanoarrow_converter_set_array(converter_xptr, array_xptr) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ n_materialized = nanoarrow_converter_materialize_n(converter_xptr, array->length);
+ if (n_materialized != array->length) {
+ Rf_error("Expected to materialize %ld values in batch %ld but materialized %ld",
+ (long)array->length, (long)n_batches, (long)n_materialized);
+ }
+
+ if (n_batches >= n) {
+ break;
+ }
+
+ result = array_stream->get_next(array_stream, array);
+ n_batches++;
+ if (result != NANOARROW_OK) {
+ Rf_error("ArrowArrayStream::get_next(): %s",
+ array_stream->get_last_error(array_stream));
+ }
+ }
+ }
+
+ if (nanoarrow_converter_finalize(converter_xptr) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ SEXP result_sexp = PROTECT(nanoarrow_converter_release_result(converter_xptr));
+ UNPROTECT(4);
+ return result_sexp;
+}
diff --git a/r/src/infer_ptype.c b/r/src/infer_ptype.c
new file mode 100644
index 0000000..70ab15f
--- /dev/null
+++ b/r/src/infer_ptype.c
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+#include "altrep.h"
+#include "array.h"
+#include "array_view.h"
+#include "materialize.h"
+#include "schema.h"
+#include "util.h"
+
+// These conversions are the default R-native type guesses for
+// an array that don't require extra information from the ptype (e.g.,
+// factor with levels). Some of these guesses may result in a conversion
+// that later warns for out-of-range values (e.g., int64 to double());
+// however, a user can use the convert_array(x, ptype = something_safer())
+// when this occurs.
+enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) {
+ switch (type) {
+ case NANOARROW_TYPE_BOOL:
+ return VECTOR_TYPE_LGL;
+
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ case NANOARROW_TYPE_INT32:
+ return VECTOR_TYPE_INT;
+
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ case NANOARROW_TYPE_DOUBLE:
+ case NANOARROW_TYPE_DECIMAL128:
+ return VECTOR_TYPE_DBL;
+
+ case NANOARROW_TYPE_STRING:
+ case NANOARROW_TYPE_LARGE_STRING:
+ return VECTOR_TYPE_CHR;
+
+ case NANOARROW_TYPE_STRUCT:
+ return VECTOR_TYPE_DATA_FRAME;
+
+ default:
+ return VECTOR_TYPE_OTHER;
+ }
+}
+
+// The same as the above, but from a nanoarrow_schema()
+enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) {
+ struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+
+ struct ArrowSchemaView schema_view;
+ struct ArrowError error;
+ if (ArrowSchemaViewInit(&schema_view, schema, &error) != NANOARROW_OK) {
+ Rf_error("nanoarrow_infer_vector_type_schema(): %s", ArrowErrorMessage(&error));
+ }
+
+ return nanoarrow_infer_vector_type(schema_view.data_type);
+}
+
+// The same as the above, but from a nanoarrow_array()
+enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr) {
+ return nanoarrow_infer_vector_type_schema(array_xptr_get_schema(array_xptr));
+}
+
+// Call nanoarrow::infer_ptype_other(), which handles less common types that
+// are easier to compute in R or gives an informative error if this is
+// not possible.
+static SEXP call_infer_ptype_other(SEXP schema_xptr) {
+ SEXP fun = PROTECT(Rf_install("infer_ptype_other"));
+ SEXP call = PROTECT(Rf_lang2(fun, schema_xptr));
+ SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
+ UNPROTECT(3);
+ return result;
+}
+
+SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);
+
+static SEXP infer_ptype_data_frame(SEXP schema_xptr) {
+ struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+ SEXP result = PROTECT(Rf_allocVector(VECSXP, schema->n_children));
+ SEXP result_names = PROTECT(Rf_allocVector(STRSXP, schema->n_children));
+
+ for (R_xlen_t i = 0; i < schema->n_children; i++) {
+ SEXP child_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, i));
+ SET_VECTOR_ELT(result, i, nanoarrow_c_infer_ptype(child_xptr));
+ UNPROTECT(1);
+
+ struct ArrowSchema* child = schema->children[i];
+ if (child->name != NULL) {
+ SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8));
+ } else {
+ SET_STRING_ELT(result_names, i, Rf_mkChar(""));
+ }
+ }
+
+ Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame);
+ Rf_setAttrib(result, R_NamesSymbol, result_names);
+ SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2));
+ INTEGER(rownames)[0] = NA_INTEGER;
+ INTEGER(rownames)[1] = 0;
+ Rf_setAttrib(result, R_RowNamesSymbol, rownames);
+ UNPROTECT(3);
+ return result;
+}
+
+SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) {
+ enum VectorType vector_type = nanoarrow_infer_vector_type_schema(schema_xptr);
+ SEXP ptype = R_NilValue;
+
+ switch (vector_type) {
+ case VECTOR_TYPE_LGL:
+ case VECTOR_TYPE_INT:
+ case VECTOR_TYPE_DBL:
+ case VECTOR_TYPE_CHR:
+ ptype = PROTECT(nanoarrow_alloc_type(vector_type, 0));
+ break;
+ case VECTOR_TYPE_DATA_FRAME:
+ ptype = PROTECT(infer_ptype_data_frame(schema_xptr));
+ break;
+ default:
+ ptype = PROTECT(call_infer_ptype_other(schema_xptr));
+ break;
+ }
+
+ UNPROTECT(1);
+ return ptype;
+}
diff --git a/r/src/init.c b/r/src/init.c
index 580d001..156ec6c 100644
--- a/r/src/init.c
+++ b/r/src/init.c
@@ -20,14 +20,13 @@
#include <Rinternals.h>
#include "altrep.h"
+#include "util.h"
/* generated by tools/make-callentries.R */
-extern SEXP nanoarrow_c_make_altrep_chr(SEXP array_view_xptr);
+extern SEXP nanoarrow_c_make_altrep_chr(SEXP array_xptr);
extern SEXP nanoarrow_c_is_altrep(SEXP x_sexp);
extern SEXP nanoarrow_c_altrep_is_materialized(SEXP x_sexp);
extern SEXP nanoarrow_c_altrep_force_materialize(SEXP x_sexp, SEXP recursive_sexp);
-extern SEXP nanoarrow_c_infer_ptype(SEXP array_xptr);
-extern SEXP nanoarrow_c_from_array(SEXP array_xptr, SEXP ptype_sexp);
extern SEXP nanoarrow_c_array_stream_get_schema(SEXP array_stream_xptr);
extern SEXP nanoarrow_c_array_stream_get_next(SEXP array_stream_xptr);
extern SEXP nanoarrow_c_array_view(SEXP array_xptr, SEXP schema_xptr);
@@ -38,6 +37,9 @@ extern SEXP nanoarrow_c_buffer_info(SEXP buffer_xptr);
extern SEXP nanoarrow_c_buffer_as_raw(SEXP buffer_xptr);
extern SEXP nanoarrow_c_build_id();
extern SEXP nanoarrow_c_build_id_runtime();
+extern SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp, SEXP size_sexp, SEXP n_sexp);
+extern SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp);
+extern SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);
extern SEXP nanoarrow_c_allocate_schema();
extern SEXP nanoarrow_c_allocate_array();
extern SEXP nanoarrow_c_allocate_array_stream();
@@ -51,6 +53,7 @@ extern SEXP nanoarrow_c_pointer_move(SEXP ptr_src, SEXP ptr_dst);
extern SEXP nanoarrow_c_export_schema(SEXP schema_xptr, SEXP ptr_dst);
extern SEXP nanoarrow_c_export_array(SEXP array_xptr, SEXP ptr_dst);
extern SEXP nanoarrow_c_schema_to_list(SEXP schema_xptr);
+extern SEXP nanoarrow_c_schema_parse(SEXP schema_xptr);
extern SEXP nanoarrow_c_schema_format(SEXP schema_xptr, SEXP recursive_sexp);
static const R_CallMethodDef CallEntries[] = {
@@ -58,8 +61,6 @@ static const R_CallMethodDef CallEntries[] = {
{"nanoarrow_c_is_altrep", (DL_FUNC)&nanoarrow_c_is_altrep, 1},
{"nanoarrow_c_altrep_is_materialized", (DL_FUNC)&nanoarrow_c_altrep_is_materialized, 1},
{"nanoarrow_c_altrep_force_materialize", (DL_FUNC)&nanoarrow_c_altrep_force_materialize, 2},
- {"nanoarrow_c_infer_ptype", (DL_FUNC)&nanoarrow_c_infer_ptype, 1},
- {"nanoarrow_c_from_array", (DL_FUNC)&nanoarrow_c_from_array, 2},
{"nanoarrow_c_array_stream_get_schema", (DL_FUNC)&nanoarrow_c_array_stream_get_schema, 1},
{"nanoarrow_c_array_stream_get_next", (DL_FUNC)&nanoarrow_c_array_stream_get_next, 1},
{"nanoarrow_c_array_view", (DL_FUNC)&nanoarrow_c_array_view, 2},
@@ -70,6 +71,9 @@ static const R_CallMethodDef CallEntries[] = {
{"nanoarrow_c_buffer_as_raw", (DL_FUNC)&nanoarrow_c_buffer_as_raw, 1},
{"nanoarrow_c_build_id", (DL_FUNC)&nanoarrow_c_build_id, 0},
{"nanoarrow_c_build_id_runtime", (DL_FUNC)&nanoarrow_c_build_id_runtime, 0},
+ {"nanoarrow_c_convert_array_stream", (DL_FUNC)&nanoarrow_c_convert_array_stream, 4},
+ {"nanoarrow_c_convert_array", (DL_FUNC)&nanoarrow_c_convert_array, 2},
+ {"nanoarrow_c_infer_ptype", (DL_FUNC)&nanoarrow_c_infer_ptype, 1},
{"nanoarrow_c_allocate_schema", (DL_FUNC)&nanoarrow_c_allocate_schema, 0},
{"nanoarrow_c_allocate_array", (DL_FUNC)&nanoarrow_c_allocate_array, 0},
{"nanoarrow_c_allocate_array_stream", (DL_FUNC)&nanoarrow_c_allocate_array_stream, 0},
@@ -83,6 +87,7 @@ static const R_CallMethodDef CallEntries[] = {
{"nanoarrow_c_export_schema", (DL_FUNC)&nanoarrow_c_export_schema, 2},
{"nanoarrow_c_export_array", (DL_FUNC)&nanoarrow_c_export_array, 2},
{"nanoarrow_c_schema_to_list", (DL_FUNC)&nanoarrow_c_schema_to_list, 1},
+ {"nanoarrow_c_schema_parse", (DL_FUNC)&nanoarrow_c_schema_parse, 1},
{"nanoarrow_c_schema_format", (DL_FUNC)&nanoarrow_c_schema_format, 2},
{NULL, NULL, 0}};
/* end generated by tools/make-callentries.R */
@@ -91,5 +96,6 @@ void R_init_nanoarrow(DllInfo* dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
+ nanoarrow_init_cached_sexps();
register_nanoarrow_altrep(dll);
}
diff --git a/r/src/materialize.c b/r/src/materialize.c
index bdd7cbc..46c1eb2 100644
--- a/r/src/materialize.c
+++ b/r/src/materialize.c
@@ -21,258 +21,236 @@
#include "nanoarrow.h"
-// Note: These conversions are not currently written for safety rather than
-// speed. We could make use of C++ templating to provide faster and/or more
-// readable conversions here with a C entry point.
+// Needed for the list_of materializer
+#include "convert.h"
-SEXP nanoarrow_materialize_lgl(struct ArrowArrayView* array_view) {
- SEXP result_sexp = PROTECT(Rf_allocVector(LGLSXP, array_view->array->length));
- int* result = LOGICAL(result_sexp);
-
- // True for all the types supported here
- const uint8_t* is_valid = array_view->buffer_views[0].data.as_uint8;
- const uint8_t* data_buffer = array_view->buffer_views[1].data.as_uint8;
-
- // Fill the buffer
- switch (array_view->storage_type) {
- case NANOARROW_TYPE_BOOL:
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- result[i] = ArrowBitGet(data_buffer, i);
- }
-
- // Set any nulls to NA_LOGICAL
- if (is_valid != NULL && array_view->array->null_count != 0) {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (!ArrowBitGet(is_valid, i)) {
- result[i] = NA_LOGICAL;
- }
- }
- }
- break;
- case NANOARROW_TYPE_INT8:
- case NANOARROW_TYPE_UINT8:
- case NANOARROW_TYPE_INT16:
- case NANOARROW_TYPE_UINT16:
- case NANOARROW_TYPE_INT32:
- case NANOARROW_TYPE_UINT32:
- case NANOARROW_TYPE_INT64:
- case NANOARROW_TYPE_UINT64:
- case NANOARROW_TYPE_FLOAT:
- case NANOARROW_TYPE_DOUBLE:
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- result[i] = ArrowArrayViewGetIntUnsafe(array_view, i) != 0;
- }
-
- // Set any nulls to NA_LOGICAL
- if (is_valid != NULL && array_view->array->null_count != 0) {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (!ArrowBitGet(is_valid, i)) {
- result[i] = NA_LOGICAL;
- }
- }
- }
- break;
+#include "materialize.h"
+#include "materialize_blob.h"
+#include "materialize_chr.h"
+#include "materialize_date.h"
+#include "materialize_dbl.h"
+#include "materialize_difftime.h"
+#include "materialize_int.h"
+#include "materialize_lgl.h"
+#include "materialize_posixct.h"
+#include "materialize_unspecified.h"
+SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len) {
+ switch (vector_type) {
+ case VECTOR_TYPE_LGL:
+ return Rf_allocVector(LGLSXP, len);
+ case VECTOR_TYPE_INT:
+ return Rf_allocVector(INTSXP, len);
+ case VECTOR_TYPE_DBL:
+ return Rf_allocVector(REALSXP, len);
+ case VECTOR_TYPE_CHR:
+ return Rf_allocVector(STRSXP, len);
default:
- UNPROTECT(1);
return R_NilValue;
}
+}
- UNPROTECT(1);
- return result_sexp;
+// A version of Rf_getAttrib(x, sym) != R_NilValue that never
+// expands the row.names attribute
+static int has_attrib_safe(SEXP x, SEXP sym) {
+ for (SEXP atts = ATTRIB(x); atts != R_NilValue; atts = CDR(atts)) {
+ if (TAG(atts) == sym) return TRUE;
+ }
+ return FALSE;
}
-SEXP nanoarrow_materialize_int(struct ArrowArrayView* array_view) {
- SEXP result_sexp = PROTECT(Rf_allocVector(INTSXP, array_view->array->length));
- int* result = INTEGER(result_sexp);
- int64_t n_bad_values = 0;
+void nanoarrow_set_rownames(SEXP x, R_xlen_t len) {
+ // If len fits in the integer range, we can use the c(NA, -nrow)
+ // shortcut for the row.names attribute. R expands this when
+ // the actual value is accessed (even from Rf_getAttrib()).
+ // If len does not fit in the integer range, we need
+ // as.character(seq_len(nrow)) (which returns a deferred ALTREP
+ // string conversion of an ALTREP sequence in recent R). Manipulating
+ // data frames with more than INT_MAX rows is not supported in most
+ // places but column access still works.
+ if (len <= INT_MAX) {
+ SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2));
+ INTEGER(rownames)[0] = NA_INTEGER;
+ INTEGER(rownames)[1] = -len;
+ Rf_setAttrib(x, R_RowNamesSymbol, rownames);
+ UNPROTECT(1);
+ } else {
+ SEXP length_dbl = PROTECT(Rf_ScalarReal(len));
+ SEXP seq_len_symbol = PROTECT(Rf_install("seq_len"));
+ SEXP seq_len_call = PROTECT(Rf_lang2(seq_len_symbol, length_dbl));
+ SEXP rownames_call = PROTECT(Rf_lang2(R_AsCharacterSymbol, seq_len_call));
+ Rf_setAttrib(x, R_RowNamesSymbol, Rf_eval(rownames_call, R_BaseNamespace));
+ UNPROTECT(4);
+ }
+}
- // True for all the types supported here
- const uint8_t* is_valid = array_view->buffer_views[0].data.as_uint8;
+int nanoarrow_ptype_is_data_frame(SEXP ptype) {
+ return Rf_isObject(ptype) && TYPEOF(ptype) == VECSXP &&
+ (Rf_inherits(ptype, "data.frame") ||
+ (Rf_xlength(ptype) > 0 && has_attrib_safe(ptype, R_NamesSymbol)));
+}
- // Fill the buffer
- switch (array_view->storage_type) {
- case NANOARROW_TYPE_INT32:
- memcpy(result,
- array_view->buffer_views[1].data.as_int32 + array_view->array->offset,
- array_view->array->length * sizeof(int32_t));
+SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) {
+ SEXP result;
- // Set any nulls to NA_INTEGER
- if (is_valid != NULL && array_view->array->null_count != 0) {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (!ArrowBitGet(is_valid, i)) {
- result[i] = NA_INTEGER;
- }
- }
- }
- break;
- case NANOARROW_TYPE_BOOL:
- case NANOARROW_TYPE_INT8:
- case NANOARROW_TYPE_UINT8:
- case NANOARROW_TYPE_INT16:
- case NANOARROW_TYPE_UINT16:
- // No need to bounds check for these types
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- result[i] = ArrowArrayViewGetIntUnsafe(array_view, i);
+ if (Rf_isObject(ptype)) {
+ if (nanoarrow_ptype_is_data_frame(ptype)) {
+ R_xlen_t num_cols = Rf_xlength(ptype);
+ result = PROTECT(Rf_allocVector(VECSXP, num_cols));
+ for (R_xlen_t i = 0; i < num_cols; i++) {
+ SET_VECTOR_ELT(result, i,
+ nanoarrow_materialize_realloc(VECTOR_ELT(ptype, i), len));
}
- // Set any nulls to NA_INTEGER
- if (is_valid != NULL && array_view->array->null_count != 0) {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (!ArrowBitGet(is_valid, i)) {
- result[i] = NA_INTEGER;
- }
- }
- }
- break;
- case NANOARROW_TYPE_UINT32:
- case NANOARROW_TYPE_INT64:
- case NANOARROW_TYPE_UINT64:
- case NANOARROW_TYPE_FLOAT:
- case NANOARROW_TYPE_DOUBLE:
- // Loop + bounds check. Because we don't know what memory might be
- // in a null slot, we have to check nulls if there are any.
- if (is_valid != NULL && array_view->array->null_count != 0) {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (ArrowBitGet(is_valid, i)) {
- int64_t value = ArrowArrayViewGetIntUnsafe(array_view, i);
- if (value > INT_MAX || value <= NA_INTEGER) {
- result[i] = NA_INTEGER;
- n_bad_values++;
- } else {
- result[i] = value;
- }
- } else {
- result[i] = NA_INTEGER;
- }
- }
- } else {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- int64_t value = ArrowArrayViewGetIntUnsafe(array_view, i);
- if (value > INT_MAX || value <= NA_INTEGER) {
- result[i] = NA_INTEGER;
- n_bad_values++;
- } else {
- result[i] = value;
- }
- }
+ // Set attributes from ptype
+ Rf_setAttrib(result, R_NamesSymbol, Rf_getAttrib(ptype, R_NamesSymbol));
+ Rf_copyMostAttrib(ptype, result);
+
+ // ...except rownames
+ if (Rf_inherits(ptype, "data.frame")) {
+ nanoarrow_set_rownames(result, len);
}
- break;
+ } else {
+ result = PROTECT(Rf_allocVector(TYPEOF(ptype), len));
+ Rf_copyMostAttrib(ptype, result);
+ }
+ } else {
+ result = PROTECT(Rf_allocVector(TYPEOF(ptype), len));
+ }
- default:
- UNPROTECT(1);
- return R_NilValue;
+ UNPROTECT(1);
+ return result;
+}
+
+static int nanoarrow_materialize_data_frame(struct RConverter* converter,
+ SEXP converter_xptr) {
+ if (converter->ptype_view.vector_type != VECTOR_TYPE_DATA_FRAME) {
+ return EINVAL;
}
- if (n_bad_values > 0) {
- Rf_warning("%ld value(s) outside integer range set to NA", (long)n_bad_values);
+ for (R_xlen_t i = 0; i < converter->n_children; i++) {
+ converter->children[i]->src.offset = converter->src.offset;
+ converter->children[i]->src.length = converter->src.length;
+ converter->children[i]->dst.offset = converter->dst.offset;
+ converter->children[i]->dst.length = converter->dst.length;
+ NANOARROW_RETURN_NOT_OK(
+ nanoarrow_materialize(converter->children[i], converter_xptr));
}
- UNPROTECT(1);
- return result_sexp;
+ return NANOARROW_OK;
+}
+
+static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr,
+ int64_t offset, int64_t length) {
+ if (nanoarrow_converter_reserve(converter_xptr, length) != NANOARROW_OK) {
+ nanoarrow_converter_stop(converter_xptr);
+ }
+
+ converter->src.offset = offset;
+ converter->src.length = length;
+ converter->dst.offset = 0;
+ converter->dst.length = length;
+
+ if (nanoarrow_converter_materialize_n(converter_xptr, length) != length) {
+ return EINVAL;
+ }
+
+ NANOARROW_RETURN_NOT_OK(nanoarrow_converter_finalize(converter_xptr));
+ return NANOARROW_OK;
}
-SEXP nanoarrow_materialize_dbl(struct ArrowArrayView* array_view) {
- SEXP result_sexp = PROTECT(Rf_allocVector(REALSXP, array_view->array->length));
- double* result = REAL(result_sexp);
+static int nanoarrow_materialize_list_of(struct RConverter* converter,
+ SEXP converter_xptr) {
+ SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+ SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
+ struct RConverter* child_converter = converter->children[0];
+ SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);
+
+ struct ArrayViewSlice* src = &converter->src;
+ struct VectorSlice* dst = &converter->dst;
- // True for all the types supported here
- const uint8_t* is_valid = array_view->buffer_views[0].data.as_uint8;
+ const int32_t* offsets = src->array_view->buffer_views[1].data.as_int32;
+ const int64_t* large_offsets = src->array_view->buffer_views[1].data.as_int64;
+ int64_t raw_src_offset = src->array_view->array->offset + src->offset;
- // Fill the buffer
- switch (array_view->storage_type) {
- case NANOARROW_TYPE_DOUBLE:
- memcpy(result,
- array_view->buffer_views[1].data.as_double + array_view->array->offset,
- array_view->array->length * sizeof(double));
+ int64_t offset;
+ int64_t length;
- // Set any nulls to NA_REAL
- if (is_valid != NULL && array_view->array->null_count != 0) {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (!ArrowBitGet(is_valid, i)) {
- result[i] = NA_REAL;
- }
+ switch (src->array_view->storage_type) {
+ case NANOARROW_TYPE_NA:
+ return NANOARROW_OK;
+ case NANOARROW_TYPE_LIST:
+ for (int64_t i = 0; i < dst->length; i++) {
+ if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
+ offset = offsets[raw_src_offset + i];
+ length = offsets[raw_src_offset + i + 1] - offset;
+ NANOARROW_RETURN_NOT_OK(materialize_list_element(
+ child_converter, child_converter_xptr, offset, length));
+ SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i,
+ nanoarrow_converter_release_result(child_converter_xptr));
}
}
break;
- case NANOARROW_TYPE_BOOL:
- case NANOARROW_TYPE_INT8:
- case NANOARROW_TYPE_UINT8:
- case NANOARROW_TYPE_INT16:
- case NANOARROW_TYPE_UINT16:
- case NANOARROW_TYPE_INT32:
- case NANOARROW_TYPE_UINT32:
- case NANOARROW_TYPE_INT64:
- case NANOARROW_TYPE_UINT64:
- case NANOARROW_TYPE_FLOAT:
- // TODO: implement bounds check for int64 and uint64, but instead
- // of setting to NA, just warn (because sequential values might not
- // roundtrip above 2^51 ish)
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- result[i] = ArrowArrayViewGetDoubleUnsafe(array_view, i);
+ case NANOARROW_TYPE_LARGE_LIST:
+ for (int64_t i = 0; i < dst->length; i++) {
+ if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
+ offset = large_offsets[raw_src_offset + i];
+ length = large_offsets[raw_src_offset + i + 1] - offset;
+ NANOARROW_RETURN_NOT_OK(materialize_list_element(
+ child_converter, child_converter_xptr, offset, length));
+ SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i,
+ nanoarrow_converter_release_result(child_converter_xptr));
+ }
}
-
- // Set any nulls to NA_REAL
- if (is_valid != NULL && array_view->array->null_count != 0) {
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (!ArrowBitGet(is_valid, i)) {
- result[i] = NA_REAL;
- }
+ break;
+ case NANOARROW_TYPE_FIXED_SIZE_LIST:
+ length = src->array_view->layout.child_size_elements;
+ for (int64_t i = 0; i < dst->length; i++) {
+ if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
+ offset = (raw_src_offset + i) * length;
+ NANOARROW_RETURN_NOT_OK(materialize_list_element(
+ child_converter, child_converter_xptr, offset, length));
+ SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i,
+ nanoarrow_converter_release_result(child_converter_xptr));
}
}
break;
-
default:
- UNPROTECT(1);
- return R_NilValue;
+ return EINVAL;
}
- UNPROTECT(1);
- return result_sexp;
+ return NANOARROW_OK;
}
-SEXP nanoarrow_materialize_chr(struct ArrowArrayView* array_view) {
- SEXP result_sexp = PROTECT(Rf_allocVector(STRSXP, array_view->array->length));
-
- struct ArrowStringView item;
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (ArrowArrayViewIsNull(array_view, i)) {
- SET_STRING_ELT(result_sexp, i, NA_STRING);
- } else {
- item = ArrowArrayViewGetStringUnsafe(array_view, i);
- SET_STRING_ELT(result_sexp, i, Rf_mkCharLenCE(item.data, item.n_bytes, CE_UTF8));
- }
- }
-
- UNPROTECT(1);
- return result_sexp;
-}
+int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr) {
+ struct ArrayViewSlice* src = &converter->src;
+ struct VectorSlice* dst = &converter->dst;
+ struct MaterializeOptions* options = converter->options;
-SEXP nanoarrow_materialize_list_of_raw(struct ArrowArrayView* array_view) {
- switch(array_view->storage_type) {
- case NANOARROW_TYPE_STRING:
- case NANOARROW_TYPE_LARGE_STRING:
- case NANOARROW_TYPE_BINARY:
- case NANOARROW_TYPE_LARGE_BINARY:
- break;
+ switch (converter->ptype_view.vector_type) {
+ case VECTOR_TYPE_UNSPECIFIED:
+ return nanoarrow_materialize_unspecified(src, dst, options);
+ case VECTOR_TYPE_LGL:
+ return nanoarrow_materialize_lgl(src, dst, options);
+ case VECTOR_TYPE_INT:
+ return nanoarrow_materialize_int(src, dst, options);
+ case VECTOR_TYPE_DBL:
+ return nanoarrow_materialize_dbl(converter);
+ case VECTOR_TYPE_CHR:
+ return nanoarrow_materialize_chr(src, dst, options);
+ case VECTOR_TYPE_POSIXCT:
+ return nanoarrow_materialize_posixct(converter);
+ case VECTOR_TYPE_DATE:
+ return nanoarrow_materialize_date(converter);
+ case VECTOR_TYPE_DIFFTIME:
+ return nanoarrow_materialize_difftime(converter);
+ case VECTOR_TYPE_BLOB:
+ return nanoarrow_materialize_blob(src, dst, options);
+ case VECTOR_TYPE_LIST_OF:
+ return nanoarrow_materialize_list_of(converter, converter_xptr);
+ case VECTOR_TYPE_DATA_FRAME:
+ return nanoarrow_materialize_data_frame(converter, converter_xptr);
default:
- return R_NilValue;
- }
-
- SEXP result_sexp = PROTECT(Rf_allocVector(VECSXP, array_view->array->length));
-
- struct ArrowBufferView item;
- SEXP item_sexp;
- for (R_xlen_t i = 0; i < array_view->array->length; i++) {
- if (!ArrowArrayViewIsNull(array_view, i)) {
- item = ArrowArrayViewGetBytesUnsafe(array_view, i);
- item_sexp = PROTECT(Rf_allocVector(RAWSXP, item.n_bytes));
- memcpy(RAW(item_sexp), item.data.data, item.n_bytes);
- SET_VECTOR_ELT(result_sexp, i, item_sexp);
- UNPROTECT(1);
- }
+ return ENOTSUP;
}
-
- UNPROTECT(1);
- return result_sexp;
}
diff --git a/r/src/materialize.h b/r/src/materialize.h
index c9aac71..eab3dc3 100644
--- a/r/src/materialize.h
+++ b/r/src/materialize.h
@@ -21,16 +21,21 @@
#include <R.h>
#include <Rinternals.h>
-#include "nanoarrow.h"
-
-// These functions materialize a complete R vector or return R_NilValue
-// if they cannot (i.e., no conversion possible). These functions will warn
-// (once) if there are values that cannot be converted (e.g., because they
-// are out of range).
-SEXP nanoarrow_materialize_lgl(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_int(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_dbl(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_chr(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_list_of_raw(struct ArrowArrayView* array_view);
+#include "materialize_common.h"
+
+// A heuristic to identify prototypes that should be treated like data frames
+// (i.e., including record-style vectors like POSIXct). This heuristic returns
+// true if ptype is a data.frame or is an S3 list with names.
+int nanoarrow_ptype_is_data_frame(SEXP ptype);
+
+// Set rownames of a data.frame (with special handling if len > INT_MAX)
+void nanoarrow_set_rownames(SEXP x, R_xlen_t len);
+
+// Perform actual materializing of values (e.g., loop through buffers)
+int nanoarrow_materialize(struct RConverter* converter, SEXP converter_xptr);
+
+// Shortcut to allocate a vector based on a vector type or ptype
+SEXP nanoarrow_alloc_type(enum VectorType vector_type, R_xlen_t len);
+SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len);
#endif
diff --git a/r/src/materialize_blob.h b/r/src/materialize_blob.h
new file mode 100644
index 0000000..0038b39
--- /dev/null
+++ b/r/src/materialize_blob.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_BLOB_H_INCLUDED
+#define R_MATERIALIZE_BLOB_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "nanoarrow.h"
+
+static inline int nanoarrow_materialize_blob(struct ArrayViewSlice* src,
+ struct VectorSlice* dst,
+ struct MaterializeOptions* options) {
+ switch (src->array_view->storage_type) {
+ case NANOARROW_TYPE_NA:
+ case NANOARROW_TYPE_STRING:
+ case NANOARROW_TYPE_LARGE_STRING:
+ case NANOARROW_TYPE_BINARY:
+ case NANOARROW_TYPE_LARGE_BINARY:
+ break;
+ default:
+ return EINVAL;
+ }
+
+ if (src->array_view->storage_type == NANOARROW_TYPE_NA) {
+ return NANOARROW_OK;
+ }
+
+ struct ArrowBufferView item;
+ SEXP item_sexp;
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (!ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
+ item = ArrowArrayViewGetBytesUnsafe(src->array_view, src->offset + i);
+ item_sexp = PROTECT(Rf_allocVector(RAWSXP, item.n_bytes));
+ memcpy(RAW(item_sexp), item.data.data, item.n_bytes);
+ SET_VECTOR_ELT(dst->vec_sexp, dst->offset + i, item_sexp);
+ UNPROTECT(1);
+ }
+ }
+
+ return NANOARROW_OK;
+}
+
+#endif
\ No newline at end of file
diff --git a/r/src/materialize_chr.h b/r/src/materialize_chr.h
new file mode 100644
index 0000000..c4316c0
--- /dev/null
+++ b/r/src/materialize_chr.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_CHR_H_INCLUDED
+#define R_MATERIALIZE_CHR_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "nanoarrow.h"
+
+static inline int nanoarrow_materialize_chr(struct ArrayViewSlice* src,
+ struct VectorSlice* dst,
+ struct MaterializeOptions* options) {
+ switch (src->array_view->storage_type) {
+ case NANOARROW_TYPE_NA:
+ case NANOARROW_TYPE_STRING:
+ case NANOARROW_TYPE_LARGE_STRING:
+ break;
+ default:
+ return EINVAL;
+ }
+
+ if (src->array_view->storage_type == NANOARROW_TYPE_NA) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING);
+ }
+
+ return NANOARROW_OK;
+ }
+
+ struct ArrowStringView item;
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
+ SET_STRING_ELT(dst->vec_sexp, dst->offset + i, NA_STRING);
+ } else {
+ item = ArrowArrayViewGetStringUnsafe(src->array_view, src->offset + i);
+ SET_STRING_ELT(dst->vec_sexp, dst->offset + i,
+ Rf_mkCharLenCE(item.data, item.n_bytes, CE_UTF8));
+ }
+ }
+
+ return NANOARROW_OK;
+}
+
+#endif
diff --git a/r/src/materialize_common.h b/r/src/materialize_common.h
new file mode 100644
index 0000000..e084f05
--- /dev/null
+++ b/r/src/materialize_common.h
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_COMMON_H_INCLUDED
+#define R_MATERIALIZE_COMMON_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "nanoarrow.h"
+
+// Vector types that have some special casing internally to avoid unnecessary allocations
+// or looping at the R level. Some of these types also need an SEXP ptype to communicate
+// additional information.
+enum VectorType {
+ VECTOR_TYPE_NULL,
+ VECTOR_TYPE_UNSPECIFIED,
+ VECTOR_TYPE_LGL,
+ VECTOR_TYPE_INT,
+ VECTOR_TYPE_DBL,
+ VECTOR_TYPE_ALTREP_CHR,
+ VECTOR_TYPE_CHR,
+ VECTOR_TYPE_POSIXCT,
+ VECTOR_TYPE_DATE,
+ VECTOR_TYPE_DIFFTIME,
+ VECTOR_TYPE_BLOB,
+ VECTOR_TYPE_LIST_OF,
+ VECTOR_TYPE_DATA_FRAME,
+ VECTOR_TYPE_OTHER
+};
+
+// More easily switch()able version of attr(difftime_obj, "units")
+enum RTimeUnits {
+ R_TIME_UNIT_SECONDS,
+ R_TIME_UNIT_MINUTES,
+ R_TIME_UNIT_HOURS,
+ R_TIME_UNIT_DAYS,
+ R_TIME_UNIT_WEEKS
+};
+
+// A "parsed" version of an SEXP ptype (like a SchemaView but for
+// R objects))
+struct PTypeView {
+ enum VectorType vector_type;
+ int sexp_type;
+ enum RTimeUnits r_time_units;
+ SEXP ptype;
+};
+
+// A wrapper around the ArrayView with an additional offset + length
+// representing a source of a materialization
+struct ArrayViewSlice {
+ struct ArrowArrayView* array_view;
+ int64_t offset;
+ int64_t length;
+};
+
+// A wapper around an SEXP vector with an additional offset + length.
+// This can be both a source and/or a target for copying from/to.
+struct VectorSlice {
+ SEXP vec_sexp;
+ R_xlen_t offset;
+ R_xlen_t length;
+};
+
+// Options for resolving a ptype and for materializing values. These are
+// currently unused but this struct is a placeholder for them when they
+// are implemented.
+struct MaterializeOptions {
+ double scale;
+};
+
+// A house for a conversion operation (i.e., zero or more arrays
+// getting converted into an R vector)). The structure of this
+// may change in the future but the API below should be relatively stable.
+// This is typically accessed via the external pointer whose API is defined
+// in convert.h
+struct RConverter {
+ struct PTypeView ptype_view;
+ struct ArrowSchemaView schema_view;
+ struct ArrowArrayView array_view;
+ struct ArrayViewSlice src;
+ struct VectorSlice dst;
+ struct MaterializeOptions* options;
+ struct ArrowError error;
+ R_xlen_t size;
+ R_xlen_t capacity;
+ R_xlen_t n_children;
+ struct RConverter** children;
+};
+
+#endif
diff --git a/r/src/materialize.h b/r/src/materialize_date.h
similarity index 57%
copy from r/src/materialize.h
copy to r/src/materialize_date.h
index c9aac71..0ef443c 100644
--- a/r/src/materialize.h
+++ b/r/src/materialize_date.h
@@ -1,3 +1,4 @@
+
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
@@ -15,22 +16,28 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef R_MATERIALIZE_H_INCLUDED
-#define R_MATERIALIZE_H_INCLUDED
+#ifndef R_MATERIALIZE_DATE_H_INCLUDED
+#define R_MATERIALIZE_DATE_H_INCLUDED
#include <R.h>
#include <Rinternals.h>
+#include "materialize_common.h"
+#include "materialize_dbl.h"
#include "nanoarrow.h"
-// These functions materialize a complete R vector or return R_NilValue
-// if they cannot (i.e., no conversion possible). These functions will warn
-// (once) if there are values that cannot be converted (e.g., because they
-// are out of range).
-SEXP nanoarrow_materialize_lgl(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_int(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_dbl(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_chr(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_list_of_raw(struct ArrowArrayView* array_view);
+static int nanoarrow_materialize_date(struct RConverter* converter) {
+ if (converter->ptype_view.sexp_type == REALSXP) {
+ switch (converter->schema_view.data_type) {
+ case NANOARROW_TYPE_NA:
+ case NANOARROW_TYPE_DATE32:
+ return nanoarrow_materialize_dbl(converter);
+ default:
+ break;
+ }
+ }
+
+ return EINVAL;
+}
#endif
diff --git a/r/src/materialize_dbl.h b/r/src/materialize_dbl.h
new file mode 100644
index 0000000..865ef4a
--- /dev/null
+++ b/r/src/materialize_dbl.h
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_DBL_H_INCLUDED
+#define R_MATERIALIZE_DBL_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "nanoarrow.h"
+#include "util.h"
+
+// Fall back to arrow for decimal conversion via a package helper
+static inline void nanoarrow_materialize_decimal_to_dbl(struct RConverter* converter) {
+ // A unique situation where we don't want owning external pointers because we know
+ // these are protected for the duration of our call into R and because we don't want
+ // then to be garbage collected and invalidate the converter
+ SEXP array_xptr =
+ PROTECT(R_MakeExternalPtr(converter->array_view.array, R_NilValue, R_NilValue));
+ Rf_setAttrib(array_xptr, R_ClassSymbol, nanoarrow_cls_array);
+ SEXP schema_xptr =
+ PROTECT(R_MakeExternalPtr(converter->schema_view.schema, R_NilValue, R_NilValue));
+ Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
+
+ SEXP offset_sexp = PROTECT(Rf_ScalarReal(converter->src.offset));
+ SEXP length_sexp = PROTECT(Rf_ScalarReal(converter->src.length));
+
+ SEXP fun = PROTECT(Rf_install("convert_decimal_to_double"));
+ SEXP call = PROTECT(Rf_lang5(fun, array_xptr, schema_xptr, offset_sexp, length_sexp));
+ SEXP result_src = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
+ if (Rf_xlength(result_src) != converter->dst.length) {
+ Rf_error("Unexpected result in call to Arrow for decimal conversion");
+ }
+
+ memcpy(REAL(converter->dst.vec_sexp) + converter->dst.offset, REAL(result_src),
+ converter->dst.length * sizeof(double));
+ UNPROTECT(7);
+}
+
+static inline int nanoarrow_materialize_dbl(struct RConverter* converter) {
+ struct ArrayViewSlice* src = &converter->src;
+ struct VectorSlice* dst = &converter->dst;
+ double* result = REAL(dst->vec_sexp);
+
+ // True for all the types supported here
+ const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8;
+ int64_t raw_src_offset = src->array_view->array->offset + src->offset;
+
+ // Fill the buffer
+ switch (src->array_view->storage_type) {
+ case NANOARROW_TYPE_NA:
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ result[dst->offset + i] = NA_REAL;
+ }
+ break;
+ case NANOARROW_TYPE_DOUBLE:
+ memcpy(result + dst->offset,
+ src->array_view->buffer_views[1].data.as_double + raw_src_offset,
+ dst->length * sizeof(double));
+
+ // Set any nulls to NA_REAL
+ if (is_valid != NULL && src->array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (!ArrowBitGet(is_valid, raw_src_offset + i)) {
+ result[dst->offset + i] = NA_REAL;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_BOOL:
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ case NANOARROW_TYPE_INT32:
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ // TODO: implement bounds check for int64 and uint64, but instead
+ // of setting to NA, just warn (because sequential values might not
+ // roundtrip above 2^51 ish)
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ result[dst->offset + i] =
+ ArrowArrayViewGetDoubleUnsafe(src->array_view, src->offset + i);
+ }
+
+ // Set any nulls to NA_REAL
+ if (is_valid != NULL && src->array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (!ArrowBitGet(is_valid, raw_src_offset + i)) {
+ result[dst->offset + i] = NA_REAL;
+ }
+ }
+ }
+ break;
+
+ case NANOARROW_TYPE_DECIMAL128:
+ nanoarrow_materialize_decimal_to_dbl(converter);
+ break;
+
+ default:
+ return EINVAL;
+ }
+
+ return NANOARROW_OK;
+}
+
+#endif
diff --git a/r/src/materialize_difftime.h b/r/src/materialize_difftime.h
new file mode 100644
index 0000000..6f91db6
--- /dev/null
+++ b/r/src/materialize_difftime.h
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_DIFFTIME_H_INCLUDED
+#define R_MATERIALIZE_DIFFTIME_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "materialize_dbl.h"
+#include "nanoarrow.h"
+
+static inline int nanoarrow_materialize_difftime(struct RConverter* converter) {
+ if (converter->ptype_view.sexp_type == REALSXP) {
+ switch (converter->schema_view.data_type) {
+ case NANOARROW_TYPE_NA:
+ NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter));
+ return NANOARROW_OK;
+ case NANOARROW_TYPE_TIME32:
+ case NANOARROW_TYPE_TIME64:
+ case NANOARROW_TYPE_DURATION:
+ NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter));
+ break;
+ default:
+ return EINVAL;
+ }
+
+ double scale;
+ switch (converter->ptype_view.r_time_units) {
+ case R_TIME_UNIT_MINUTES:
+ scale = 1.0 / 60;
+ break;
+ case R_TIME_UNIT_HOURS:
+ scale = 1.0 / (60 * 60);
+ break;
+ case R_TIME_UNIT_DAYS:
+ scale = 1.0 / (60 * 60 * 24);
+ break;
+ case R_TIME_UNIT_WEEKS:
+ scale = 1.0 / (60 * 60 * 24 * 7);
+ break;
+ default:
+ scale = 1.0;
+ break;
+ }
+
+ switch (converter->schema_view.time_unit) {
+ case NANOARROW_TIME_UNIT_SECOND:
+ scale *= 1;
+ break;
+ case NANOARROW_TIME_UNIT_MILLI:
+ scale *= 1e-3;
+ break;
+ case NANOARROW_TIME_UNIT_MICRO:
+ scale *= 1e-6;
+ break;
+ case NANOARROW_TIME_UNIT_NANO:
+ scale *= 1e-9;
+ break;
+ default:
+ return EINVAL;
+ }
+
+ if (scale != 1) {
+ double* result = REAL(converter->dst.vec_sexp);
+ for (int64_t i = 0; i < converter->dst.length; i++) {
+ result[converter->dst.offset + i] = result[converter->dst.offset + i] * scale;
+ }
+ }
+
+ return NANOARROW_OK;
+ }
+
+ return EINVAL;
+}
+
+#endif
diff --git a/r/src/materialize_int.h b/r/src/materialize_int.h
new file mode 100644
index 0000000..3d8a392
--- /dev/null
+++ b/r/src/materialize_int.h
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_INT_H_INCLUDED
+#define R_MATERIALIZE_INT_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "nanoarrow.h"
+
+static inline int nanoarrow_materialize_int(struct ArrayViewSlice* src,
+ struct VectorSlice* dst,
+ struct MaterializeOptions* options) {
+ int* result = INTEGER(dst->vec_sexp);
+ int64_t n_bad_values = 0;
+
+ // True for all the types supported here
+ const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8;
+ int64_t raw_src_offset = src->array_view->array->offset + src->offset;
+
+ // Fill the buffer
+ switch (src->array_view->storage_type) {
+ case NANOARROW_TYPE_NA:
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ result[dst->offset + i] = NA_INTEGER;
+ }
+ break;
+ case NANOARROW_TYPE_INT32:
+ memcpy(result + dst->offset,
+ src->array_view->buffer_views[1].data.as_int32 + raw_src_offset,
+ dst->length * sizeof(int32_t));
+
+ // Set any nulls to NA_INTEGER
+ if (is_valid != NULL && src->array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (!ArrowBitGet(is_valid, raw_src_offset + i)) {
+ result[dst->offset + i] = NA_INTEGER;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_BOOL:
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ // No need to bounds check for these types
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ result[dst->offset + i] =
+ ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i);
+ }
+
+ // Set any nulls to NA_INTEGER
+ if (is_valid != NULL && src->array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (!ArrowBitGet(is_valid, raw_src_offset + i)) {
+ result[dst->offset + i] = NA_INTEGER;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ case NANOARROW_TYPE_DOUBLE:
+ // Loop + bounds check. Because we don't know what memory might be
+ // in a null slot, we have to check nulls if there are any.
+ if (is_valid != NULL && src->array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (ArrowBitGet(is_valid, raw_src_offset + i)) {
+ int64_t value = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i);
+ if (value > INT_MAX || value <= NA_INTEGER) {
+ result[dst->offset + i] = NA_INTEGER;
+ n_bad_values++;
+ } else {
+ result[dst->offset + i] = value;
+ }
+ } else {
+ result[dst->offset + i] = NA_INTEGER;
+ }
+ }
+ } else {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ int64_t value = ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i);
+ if (value > INT_MAX || value <= NA_INTEGER) {
+ result[dst->offset + i] = NA_INTEGER;
+ n_bad_values++;
+ } else {
+ result[dst->offset + i] = value;
+ }
+ }
+ }
+ break;
+
+ default:
+ return EINVAL;
+ }
+
+ if (n_bad_values > 0) {
+ Rf_warning("%ld value(s) outside integer range set to NA", (long)n_bad_values);
+ }
+
+ return NANOARROW_OK;
+}
+
+#endif
diff --git a/r/src/materialize_lgl.h b/r/src/materialize_lgl.h
new file mode 100644
index 0000000..cf9e149
--- /dev/null
+++ b/r/src/materialize_lgl.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_LGL_H_INCLUDED
+#define R_MATERIALIZE_LGL_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "nanoarrow.h"
+
+static int nanoarrow_materialize_lgl(struct ArrayViewSlice* src, struct VectorSlice* dst,
+ struct MaterializeOptions* options) {
+ // True for all the types supported here
+ const uint8_t* is_valid = src->array_view->buffer_views[0].data.as_uint8;
+ const uint8_t* data_buffer = src->array_view->buffer_views[1].data.as_uint8;
+ int64_t raw_src_offset = src->array_view->array->offset + src->offset;
+ int* result = LOGICAL(dst->vec_sexp);
+
+ // Fill the buffer
+ switch (src->array_view->storage_type) {
+ case NANOARROW_TYPE_NA:
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ result[dst->offset + i] = NA_LOGICAL;
+ }
+ break;
+ case NANOARROW_TYPE_BOOL:
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ result[dst->offset + i] = ArrowBitGet(data_buffer, src->offset + i);
+ }
+
+ // Set any nulls to NA_LOGICAL
+ if (is_valid != NULL && src->array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (!ArrowBitGet(is_valid, raw_src_offset + i)) {
+ result[dst->offset + i] = NA_LOGICAL;
+ }
+ }
+ }
+ break;
+ case NANOARROW_TYPE_INT8:
+ case NANOARROW_TYPE_UINT8:
+ case NANOARROW_TYPE_INT16:
+ case NANOARROW_TYPE_UINT16:
+ case NANOARROW_TYPE_INT32:
+ case NANOARROW_TYPE_UINT32:
+ case NANOARROW_TYPE_INT64:
+ case NANOARROW_TYPE_UINT64:
+ case NANOARROW_TYPE_FLOAT:
+ case NANOARROW_TYPE_DOUBLE:
+ for (R_xlen_t i = 0; i < src->array_view->array->length; i++) {
+ result[dst->offset + i] =
+ ArrowArrayViewGetIntUnsafe(src->array_view, src->offset + i) != 0;
+ }
+
+ // Set any nulls to NA_LOGICAL
+ if (is_valid != NULL && src->array_view->array->null_count != 0) {
+ for (R_xlen_t i = 0; i < dst->length; i++) {
+ if (!ArrowBitGet(is_valid, raw_src_offset + i)) {
+ result[dst->offset + i] = NA_LOGICAL;
+ }
+ }
+ }
+ break;
+
+ default:
+ return EINVAL;
+ }
+
+ return NANOARROW_OK;
+}
+
+#endif
diff --git a/r/src/materialize_posixct.h b/r/src/materialize_posixct.h
new file mode 100644
index 0000000..e8641e1
--- /dev/null
+++ b/r/src/materialize_posixct.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_POSIXCT_H_INCLUDED
+#define R_MATERIALIZE_POSIXCT_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "materialize_dbl.h"
+#include "nanoarrow.h"
+
+static inline int nanoarrow_materialize_posixct(struct RConverter* converter) {
+ if (converter->ptype_view.sexp_type == REALSXP) {
+ enum ArrowTimeUnit time_unit;
+ switch (converter->schema_view.data_type) {
+ case NANOARROW_TYPE_NA:
+ time_unit = NANOARROW_TIME_UNIT_SECOND;
+ NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter));
+ break;
+ case NANOARROW_TYPE_DATE64:
+ time_unit = NANOARROW_TIME_UNIT_MILLI;
+ NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter));
+ break;
+ case NANOARROW_TYPE_TIMESTAMP:
+ time_unit = converter->schema_view.time_unit;
+ NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_dbl(converter));
+ break;
+ default:
+ return EINVAL;
+ }
+
+ double scale;
+ switch (time_unit) {
+ case NANOARROW_TIME_UNIT_SECOND:
+ scale = 1;
+ break;
+ case NANOARROW_TIME_UNIT_MILLI:
+ scale = 1e-3;
+ break;
+ case NANOARROW_TIME_UNIT_MICRO:
+ scale = 1e-6;
+ break;
+ case NANOARROW_TIME_UNIT_NANO:
+ scale = 1e-9;
+ break;
+ default:
+ return EINVAL;
+ }
+
+ if (scale != 1) {
+ double* result = REAL(converter->dst.vec_sexp);
+ for (int64_t i = 0; i < converter->dst.length; i++) {
+ result[converter->dst.offset + i] = result[converter->dst.offset + i] * scale;
+ }
+ }
+
+ return NANOARROW_OK;
+ }
+
+ return EINVAL;
+}
+
+#endif
diff --git a/r/src/materialize_unspecified.h b/r/src/materialize_unspecified.h
new file mode 100644
index 0000000..acf5f8b
--- /dev/null
+++ b/r/src/materialize_unspecified.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_MATERIALIZE_UNSPECIFIED_H_INCLUDED
+#define R_MATERIALIZE_UNSPECIFIED_H_INCLUDED
+
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize_common.h"
+#include "nanoarrow.h"
+
+static inline int nanoarrow_materialize_unspecified(struct ArrayViewSlice* src,
+ struct VectorSlice* dst,
+ struct MaterializeOptions* options) {
+ int* result = LOGICAL(dst->vec_sexp);
+
+ int64_t total_offset = src->array_view->array->offset + src->offset;
+ int64_t length = src->length;
+ const uint8_t* bits = src->array_view->buffer_views[0].data.as_uint8;
+
+ if (length == 0 || src->array_view->storage_type == NANOARROW_TYPE_NA ||
+ ArrowBitCountSet(bits, total_offset, length) == 0) {
+ // We can blindly set all the values to NA_LOGICAL without checking
+ for (int64_t i = 0; i < length; i++) {
+ result[dst->offset + i] = NA_LOGICAL;
+ }
+ } else {
+ // Count non-null values and warn
+ int64_t n_bad_values = 0;
+ for (int64_t i = 0; i < length; i++) {
+ n_bad_values += ArrowBitGet(bits, total_offset + i);
+ result[dst->offset + i] = NA_LOGICAL;
+ }
+
+ if (n_bad_values > 0) {
+ Rf_warning("%ld non-null value(s) set to NA", (long)n_bad_values);
+ }
+ }
+
+ return NANOARROW_OK;
+}
+
+#endif
\ No newline at end of file
diff --git a/r/src/schema.c b/r/src/schema.c
index 3726ef2..ab73d33 100644
--- a/r/src/schema.c
+++ b/r/src/schema.c
@@ -21,6 +21,7 @@
#include "nanoarrow.h"
#include "schema.h"
+#include "util.h"
void finalize_schema_xptr(SEXP schema_xptr) {
struct ArrowSchema* schema = (struct ArrowSchema*)R_ExternalPtrAddr(schema_xptr);
@@ -63,7 +64,7 @@ static SEXP schema_metadata_to_list(const char* metadata) {
static SEXP borrow_schema_xptr(struct ArrowSchema* schema, SEXP shelter) {
SEXP schema_xptr = PROTECT(R_MakeExternalPtr(schema, R_NilValue, shelter));
- Rf_setAttrib(schema_xptr, R_ClassSymbol, Rf_mkString("nanoarrow_schema"));
+ Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
UNPROTECT(1);
return schema_xptr;
}
@@ -104,7 +105,8 @@ SEXP nanoarrow_c_schema_to_list(SEXP schema_xptr) {
SEXP child_xptr = PROTECT(borrow_schema_xptr(schema->children[i], schema_xptr));
SET_VECTOR_ELT(children_sexp, i, child_xptr);
if (schema->children[i]->name != NULL) {
- SET_STRING_ELT(children_names_sexp, i, Rf_mkCharCE(schema->children[i]->name, CE_UTF8));
+ SET_STRING_ELT(children_names_sexp, i,
+ Rf_mkCharCE(schema->children[i]->name, CE_UTF8));
} else {
SET_STRING_ELT(children_names_sexp, i, Rf_mkCharCE("", CE_UTF8));
}
@@ -129,6 +131,98 @@ SEXP nanoarrow_c_schema_to_list(SEXP schema_xptr) {
return result;
}
+static SEXP mkStringView(struct ArrowStringView* view) {
+ if (view->data == NULL) {
+ return R_NilValue;
+ }
+
+ SEXP chr = PROTECT(Rf_mkCharLenCE(view->data, view->n_bytes, CE_UTF8));
+ SEXP str = PROTECT(Rf_allocVector(STRSXP, 1));
+ SET_STRING_ELT(str, 0, chr);
+ UNPROTECT(2);
+ return str;
+}
+
+SEXP nanoarrow_c_schema_parse(SEXP schema_xptr) {
+ struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+
+ struct ArrowSchemaView schema_view;
+ struct ArrowError error;
+ int status = ArrowSchemaViewInit(&schema_view, schema, &error);
+ if (status != NANOARROW_OK) {
+ Rf_error("ArrowSchemaViewInit(): %s", ArrowErrorMessage(&error));
+ }
+
+ const char* names[] = {
+ "type", "storage_type", "extension_name", "extension_metadata",
+ "fixed_size", "decimal_bitwidth", "decimal_precision", "decimal_scale",
+ "time_unit", "timezone", "union_type_ids", ""};
+
+ SEXP result = PROTECT(Rf_mkNamed(VECSXP, names));
+ SET_VECTOR_ELT(result, 0, Rf_mkString(ArrowTypeString((schema_view.data_type))));
+ SET_VECTOR_ELT(result, 1,
+ Rf_mkString(ArrowTypeString((schema_view.storage_data_type))));
+
+ if (schema_view.extension_name.data != NULL) {
+ SET_VECTOR_ELT(result, 2, mkStringView(&schema_view.extension_name));
+ }
+
+ if (schema_view.extension_metadata.data != NULL) {
+ SEXP metadata_sexp =
+ PROTECT(Rf_allocVector(RAWSXP, schema_view.extension_metadata.n_bytes));
+ memcpy(RAW(metadata_sexp), schema_view.extension_metadata.data,
+ schema_view.extension_metadata.n_bytes);
+ SET_VECTOR_ELT(result, 3, metadata_sexp);
+ UNPROTECT(1);
+ }
+
+ if (schema_view.data_type == NANOARROW_TYPE_FIXED_SIZE_LIST ||
+ schema_view.data_type == NANOARROW_TYPE_FIXED_SIZE_BINARY) {
+ SET_VECTOR_ELT(result, 4, Rf_ScalarInteger(schema_view.fixed_size));
+ }
+
+ if (schema_view.data_type == NANOARROW_TYPE_DECIMAL128 ||
+ schema_view.data_type == NANOARROW_TYPE_DECIMAL256) {
+ SET_VECTOR_ELT(result, 5, Rf_ScalarInteger(schema_view.decimal_bitwidth));
+ SET_VECTOR_ELT(result, 6, Rf_ScalarInteger(schema_view.decimal_precision));
+ SET_VECTOR_ELT(result, 7, Rf_ScalarInteger(schema_view.decimal_scale));
+ }
+
+ if (schema_view.data_type == NANOARROW_TYPE_TIME32 ||
+ schema_view.data_type == NANOARROW_TYPE_TIME64 ||
+ schema_view.data_type == NANOARROW_TYPE_TIMESTAMP ||
+ schema_view.data_type == NANOARROW_TYPE_DURATION) {
+ SET_VECTOR_ELT(result, 8, Rf_mkString(ArrowTimeUnitString((schema_view.time_unit))));
+ }
+
+ if (schema_view.data_type == NANOARROW_TYPE_TIMESTAMP) {
+ SET_VECTOR_ELT(result, 9, mkStringView(&schema_view.timezone));
+ }
+
+ if (schema_view.data_type == NANOARROW_TYPE_DENSE_UNION ||
+ schema_view.data_type == NANOARROW_TYPE_SPARSE_UNION) {
+ int num_type_ids = 1;
+ for (int64_t i = 0; i < schema_view.union_type_ids.n_bytes; i++) {
+ num_type_ids += schema_view.union_type_ids.data[i] == ',';
+ }
+
+ SEXP union_type_ids = PROTECT(Rf_allocVector(INTSXP, num_type_ids));
+ const char* ptr = schema_view.union_type_ids.data;
+ char* end_ptr = (char*)ptr;
+ int i = 0;
+ while (*end_ptr != '\0') {
+ INTEGER(union_type_ids)[i] = strtol(ptr, &end_ptr, 10);
+ i++;
+ ptr = end_ptr + 1;
+ }
+ SET_VECTOR_ELT(result, 10, union_type_ids);
+ UNPROTECT(1);
+ }
+
+ UNPROTECT(1);
+ return result;
+}
+
SEXP nanoarrow_c_schema_format(SEXP schema_xptr, SEXP recursive_sexp) {
int recursive = LOGICAL(recursive_sexp)[0];
@@ -148,7 +242,8 @@ SEXP nanoarrow_c_schema_format(SEXP schema_xptr, SEXP recursive_sexp) {
SEXP formatted_sexp = PROTECT(Rf_allocVector(RAWSXP, size_needed + 1));
ArrowSchemaToString(schema, (char*)RAW(formatted_sexp), size_needed + 1, recursive);
SEXP result_sexp = PROTECT(Rf_allocVector(STRSXP, 1));
- SET_STRING_ELT(result_sexp, 0, Rf_mkCharLenCE((char*)RAW(formatted_sexp), size_needed, CE_UTF8));
+ SET_STRING_ELT(result_sexp, 0,
+ Rf_mkCharLenCE((char*)RAW(formatted_sexp), size_needed, CE_UTF8));
UNPROTECT(2);
return result_sexp;
}
diff --git a/r/src/schema.h b/r/src/schema.h
index 9d38ef2..54bf4fa 100644
--- a/r/src/schema.h
+++ b/r/src/schema.h
@@ -22,6 +22,7 @@
#include <Rinternals.h>
#include "nanoarrow.h"
+#include "util.h"
void finalize_schema_xptr(SEXP schema_xptr);
@@ -72,7 +73,7 @@ static inline SEXP schema_owning_xptr() {
schema->release = NULL;
SEXP schema_xptr = PROTECT(R_MakeExternalPtr(schema, R_NilValue, R_NilValue));
- Rf_setAttrib(schema_xptr, R_ClassSymbol, Rf_mkString("nanoarrow_schema"));
+ Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
R_RegisterCFinalizer(schema_xptr, &finalize_schema_xptr);
UNPROTECT(1);
return schema_xptr;
diff --git a/r/src/util.c b/r/src/util.c
new file mode 100644
index 0000000..9e55a37
--- /dev/null
+++ b/r/src/util.c
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "util.h"
+
+SEXP nanoarrow_ns_pkg = NULL;
+SEXP nanoarrow_cls_array = NULL;
+SEXP nanoarrow_cls_altrep_chr = NULL;
+SEXP nanoarrow_cls_array_view = NULL;
+SEXP nanoarrow_cls_data_frame = NULL;
+SEXP nanoarrow_cls_schema = NULL;
+SEXP nanoarrow_cls_array_stream = NULL;
+
+void nanoarrow_init_cached_sexps() {
+ SEXP nanoarrow_str = PROTECT(Rf_mkString("nanoarrow"));
+ nanoarrow_ns_pkg = PROTECT(R_FindNamespace(nanoarrow_str));
+ nanoarrow_cls_array = PROTECT(Rf_mkString("nanoarrow_array"));
+ nanoarrow_cls_altrep_chr = PROTECT(Rf_mkString("nanoarrow::altrep_chr"));
+ nanoarrow_cls_array_view = PROTECT(Rf_mkString("nanoarrow_array_view"));
+ nanoarrow_cls_data_frame = PROTECT(Rf_mkString("data.frame"));
+ nanoarrow_cls_schema = PROTECT(Rf_mkString("nanoarrow_schema"));
+ nanoarrow_cls_array_stream = PROTECT(Rf_mkString("nanoarrow_array_stream"));
+
+ R_PreserveObject(nanoarrow_ns_pkg);
+ R_PreserveObject(nanoarrow_cls_array);
+ R_PreserveObject(nanoarrow_cls_altrep_chr);
+ R_PreserveObject(nanoarrow_cls_array_view);
+ R_PreserveObject(nanoarrow_cls_data_frame);
+ R_PreserveObject(nanoarrow_cls_schema);
+ R_PreserveObject(nanoarrow_cls_array_stream);
+
+ UNPROTECT(8);
+}
diff --git a/r/src/materialize.h b/r/src/util.h
similarity index 55%
copy from r/src/materialize.h
copy to r/src/util.h
index c9aac71..18e16b1 100644
--- a/r/src/materialize.h
+++ b/r/src/util.h
@@ -15,22 +15,20 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef R_MATERIALIZE_H_INCLUDED
-#define R_MATERIALIZE_H_INCLUDED
+#ifndef R_UTIL_H_INCLUDED
+#define R_UTIL_H_INCLUDED
#include <R.h>
#include <Rinternals.h>
-#include "nanoarrow.h"
+extern SEXP nanoarrow_ns_pkg;
+extern SEXP nanoarrow_cls_array;
+extern SEXP nanoarrow_cls_altrep_chr;
+extern SEXP nanoarrow_cls_array_view;
+extern SEXP nanoarrow_cls_data_frame;
+extern SEXP nanoarrow_cls_schema;
+extern SEXP nanoarrow_cls_array_stream;
-// These functions materialize a complete R vector or return R_NilValue
-// if they cannot (i.e., no conversion possible). These functions will warn
-// (once) if there are values that cannot be converted (e.g., because they
-// are out of range).
-SEXP nanoarrow_materialize_lgl(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_int(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_dbl(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_chr(struct ArrowArrayView* array_view);
-SEXP nanoarrow_materialize_list_of_raw(struct ArrowArrayView* array_view);
+void nanoarrow_init_cached_sexps();
#endif
diff --git a/r/tests/testthat/test-array-convert-vector.R b/r/tests/testthat/test-array-convert-vector.R
deleted file mode 100644
index 6e40a03..0000000
--- a/r/tests/testthat/test-array-convert-vector.R
+++ /dev/null
@@ -1,375 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-test_that("infer_nanoarrow_ptype() works for basic types", {
- expect_identical(
- infer_nanoarrow_ptype(as_nanoarrow_array(logical())),
- logical()
- )
-
- expect_identical(
- infer_nanoarrow_ptype(as_nanoarrow_array(integer())),
- integer()
- )
-
- expect_identical(
- infer_nanoarrow_ptype(as_nanoarrow_array(double())),
- double()
- )
-
- expect_identical(
- infer_nanoarrow_ptype(as_nanoarrow_array(character())),
- character()
- )
-
- expect_identical(
- infer_nanoarrow_ptype(as_nanoarrow_array(data.frame(x = character()))),
- data.frame(x = character())
- )
-})
-
-test_that("infer_nanoarrow_ptype() errors for types it can't infer", {
- unsupported_array <- arrow::concat_arrays(type = arrow::decimal256(3, 4))
- expect_error(
- infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_array)),
- "Can't infer R vector type for array <d:3,4,256>"
- )
-
- unsupported_struct <- arrow::concat_arrays(
- type = arrow::struct(col = arrow::decimal256(3, 4))
- )
- expect_error(
- infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_struct)),
- "Can't infer R vector type for `col` <d:3,4,256>"
- )
-})
-
-test_that("from_nanoarrow_array() errors for invalid arrays", {
- array <- as_nanoarrow_array(1:10)
- nanoarrow_array_set_schema(
- array,
- infer_nanoarrow_schema("chr"),
- validate = FALSE
- )
-
- expect_error(
- from_nanoarrow_array(array),
- "Expected array with 3 buffer"
- )
-})
-
-test_that("from_nanoarrow_array() errors for unsupported ptype", {
- array <- as_nanoarrow_array(1:10)
-
- # an S3 unsupported type
- expect_error(
- from_nanoarrow_array(array, structure(list(), class = "some_class")),
- "Can't convert array <i> to R vector of type some_class"
- )
-
- # A non-S3 unsupported type
- expect_error(
- from_nanoarrow_array(array, environment()),
- "Can't convert array <i> to R vector of type environment"
- )
-})
-
-test_that("from_nanoarrow_array() errors for unsupported array", {
- unsupported_array <- arrow::concat_arrays(type = arrow::decimal256(3, 4))
- expect_error(
- from_nanoarrow_array(as_nanoarrow_array(unsupported_array)),
- "Can't infer R vector type for array <d:3,4,256>"
- )
-})
-
-test_that("convert to vector works for data.frame", {
- df <- data.frame(a = 1L, b = "two", c = 3, d = TRUE)
- array <- as_nanoarrow_array(df)
-
- expect_identical(from_nanoarrow_array(array, NULL), df)
- expect_identical(from_nanoarrow_array(array, df), df)
-
- expect_error(
- from_nanoarrow_array(array, data.frame(a = integer(), b = raw())),
- "Expected data.frame\\(\\) ptype with 4 column\\(s\\) but found 2 column\\(s\\)"
- )
-
- bad_ptype <- data.frame(a = integer(), b = raw(), c = double(), d = integer())
- expect_error(
- from_nanoarrow_array(array, bad_ptype),
- "Can't convert `b` <u> to R vector of type raw"
- )
-})
-
-test_that("convert to vector works for partial_frame", {
- array <- as_nanoarrow_array(data.frame(a = 1L, b = "two"))
- expect_identical(
- from_nanoarrow_array(array, vctrs::partial_frame()),
- data.frame(a = 1L, b = "two")
- )
-})
-
-test_that("convert to vector works for tibble", {
- array <- as_nanoarrow_array(data.frame(a = 1L, b = "two"))
- expect_identical(
- from_nanoarrow_array(array, tibble::tibble(a = integer(), b = character())),
- tibble::tibble(a = 1L, b = "two")
- )
-
- # Check nested tibble at both levels
- tbl_nested_df <- tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
- array_nested <- as_nanoarrow_array(tbl_nested_df)
-
- expect_identical(
- from_nanoarrow_array(array_nested, tbl_nested_df),
- tbl_nested_df
- )
-
- df_nested_tbl <- as.data.frame(tbl_nested_df)
- df_nested_tbl$c <- tibble::as_tibble(df_nested_tbl$c)
- expect_identical(
- from_nanoarrow_array(array_nested, df_nested_tbl),
- df_nested_tbl
- )
-})
-
-test_that("convert to vector works for valid logical()", {
- arrow_numeric_types <- list(
- int8 = arrow::int8(),
- uint8 = arrow::uint8(),
- int16 = arrow::int16(),
- uint16 = arrow::uint16(),
- int32 = arrow::int32(),
- uint32 = arrow::uint32(),
- int64 = arrow::int64(),
- uint64 = arrow::uint64(),
- float32 = arrow::float32(),
- float64 = arrow::float64()
- )
-
- vals <- c(NA, 0:10)
- for (nm in names(arrow_numeric_types)) {
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
- logical()
- ),
- vals != 0
- )
- }
-
- vals_no_na <- 0:10
- for (nm in names(arrow_numeric_types)) {
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
- logical()
- ),
- vals_no_na != 0
- )
- }
-
- # Boolean array to logical
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
- logical()
- ),
- c(NA, TRUE, FALSE)
- )
-
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
- logical()
- ),
- c(TRUE, FALSE)
- )
-})
-
-test_that("convert to vector errors for bad array to logical()", {
- expect_error(
- from_nanoarrow_array(as_nanoarrow_array(letters), logical()),
- "Can't convert array <u> to R vector of type logical"
- )
-})
-
-test_that("convert to vector works for valid integer()", {
- arrow_int_types <- list(
- int8 = arrow::int8(),
- uint8 = arrow::uint8(),
- int16 = arrow::int16(),
- uint16 = arrow::uint16(),
- int32 = arrow::int32(),
- uint32 = arrow::uint32(),
- int64 = arrow::int64(),
- uint64 = arrow::uint64(),
- float32 = arrow::float32(),
- float64 = arrow::float64()
- )
-
- ints <- c(NA, 0:10)
- for (nm in names(arrow_int_types)) {
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(ints, schema = arrow_int_types[[!!nm]]),
- integer()
- ),
- ints
- )
- }
-
- ints_no_na <- 0:10
- for (nm in names(arrow_int_types)) {
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(ints_no_na, schema = arrow_int_types[[!!nm]]),
- integer()
- ),
- ints_no_na
- )
- }
-
- # Boolean array to integer
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
- integer()
- ),
- c(NA, 1L, 0L)
- )
-
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
- integer()
- ),
- c(1L, 0L)
- )
-})
-
-test_that("convert to vector warns for invalid integer()", {
- array <- as_nanoarrow_array(arrow::as_arrow_array(.Machine$double.xmax))
- expect_warning(
- expect_identical(from_nanoarrow_array(array, integer()), NA_integer_),
- "1 value\\(s\\) outside integer range set to NA"
- )
-
- array <- as_nanoarrow_array(arrow::as_arrow_array(c(NA, .Machine$double.xmax)))
- expect_warning(
- expect_identical(from_nanoarrow_array(array, integer()), c(NA_integer_, NA_integer_)),
- "1 value\\(s\\) outside integer range set to NA"
- )
-})
-
-test_that("convert to vector errors for bad array to integer()", {
- expect_error(
- from_nanoarrow_array(as_nanoarrow_array(letters), integer()),
- "Can't convert array <u> to R vector of type integer"
- )
-})
-
-test_that("convert to vector works for valid double()", {
- arrow_numeric_types <- list(
- int8 = arrow::int8(),
- uint8 = arrow::uint8(),
- int16 = arrow::int16(),
- uint16 = arrow::uint16(),
- int32 = arrow::int32(),
- uint32 = arrow::uint32(),
- int64 = arrow::int64(),
- uint64 = arrow::uint64(),
- float32 = arrow::float32(),
- float64 = arrow::float64()
- )
-
- vals <- as.double(c(NA, 0:10))
- for (nm in names(arrow_numeric_types)) {
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
- double()
- ),
- vals
- )
- }
-
- vals_no_na <- as.double(0:10)
- for (nm in names(arrow_numeric_types)) {
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
- double()
- ),
- vals_no_na
- )
- }
-
- # Boolean array to double
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
- double()
- ),
- as.double(c(NA, 1L, 0L))
- )
-
- expect_identical(
- from_nanoarrow_array(
- as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
- double()
- ),
- as.double(c(1L, 0L))
- )
-})
-
-test_that("convert to vector errors for bad array to double()", {
- expect_error(
- from_nanoarrow_array(as_nanoarrow_array(letters), double()),
- "Can't convert array <u> to R vector of type numeric"
- )
-})
-
-test_that("convert to vector works for character()", {
- array <- as_nanoarrow_array(letters)
- expect_identical(
- from_nanoarrow_array(array, character()),
- letters
- )
-
- # make sure we get altrep here
- expect_true(is_nanoarrow_altrep(from_nanoarrow_array(array, character())))
-
- # check an array that we can't convert
- expect_error(
- from_nanoarrow_array(as_nanoarrow_array(1:5), character()),
- "Can't convert array <i> to R vector of type character"
- )
-})
-
-test_that("convert to vector works for character()", {
- array <- as_nanoarrow_array(list(as.raw(1:5)), schema = arrow::binary())
- expect_identical(
- from_nanoarrow_array(array),
- list(as.raw(1:5))
- )
-
- expect_identical(
- from_nanoarrow_array(array, list()),
- list(as.raw(1:5))
- )
-})
diff --git a/r/tests/testthat/test-array-stream.R b/r/tests/testthat/test-array-stream.R
index 33cd9b1..cc0e243 100644
--- a/r/tests/testthat/test-array-stream.R
+++ b/r/tests/testthat/test-array-stream.R
@@ -46,6 +46,22 @@ test_that("infer_nanoarrow_schema() is implemented for streams", {
expect_identical(schema$children$x$format, "i")
})
+test_that("as.data.frame() is implemented for streams", {
+ stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))
+ expect_identical(
+ as.data.frame(stream),
+ data.frame(x = 1:5)
+ )
+})
+
+test_that("as.vector() is implemented for streams", {
+ stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))
+ expect_identical(
+ as.vector(stream),
+ data.frame(x = 1:5)
+ )
+})
+
test_that("nanoarrow_array_stream list interface works", {
stream <- as_nanoarrow_array_stream(data.frame(x = 1:5))
expect_identical(length(stream), 3L)
diff --git a/r/tests/testthat/test-array.R b/r/tests/testthat/test-array.R
index b58362d..d37d414 100644
--- a/r/tests/testthat/test-array.R
+++ b/r/tests/testthat/test-array.R
@@ -37,12 +37,12 @@ test_that("schemaless nanoarrow_array format, print, and str methods work", {
expect_output(expect_identical(print(array), array), "nanoarrow_array")
})
-test_that("as_nanoarrow_array() / from_nanoarrow_array() default method works", {
+test_that("as_nanoarrow_array() / convert_array() default method works", {
array <- as_nanoarrow_array(1:10)
- expect_identical(from_nanoarrow_array(array), 1:10)
+ expect_identical(convert_array(array), 1:10)
array <- as_nanoarrow_array(as.double(1:10), schema = arrow::float64())
- expect_identical(from_nanoarrow_array(array), as.double(1:10))
+ expect_identical(convert_array(array), as.double(1:10))
})
test_that("infer_nanoarrow_schema() works for nanoarrow_array", {
diff --git a/r/tests/testthat/test-convert-array-stream.R b/r/tests/testthat/test-convert-array-stream.R
new file mode 100644
index 0000000..5f58cba
--- /dev/null
+++ b/r/tests/testthat/test-convert-array-stream.R
@@ -0,0 +1,208 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("convert array stream works", {
+ stream0 <- arrow::RecordBatchReader$create(
+ schema = arrow::schema(x = arrow::int32())
+ )
+ stream0 <- as_nanoarrow_array_stream(stream0)
+ expect_identical(convert_array_stream(stream0), data.frame(x = integer()))
+
+ stream1 <- arrow::RecordBatchReader$create(
+ arrow::record_batch(x = 1:5)
+ )
+ stream1 <- as_nanoarrow_array_stream(stream1)
+ expect_identical(convert_array_stream(stream1), data.frame(x = 1:5))
+
+ stream2 <- arrow::RecordBatchReader$create(
+ arrow::record_batch(x = 1:5),
+ arrow::record_batch(x = 6:10)
+ )
+ stream2 <- as_nanoarrow_array_stream(stream2)
+ expect_identical(convert_array_stream(stream2), data.frame(x = 1:10))
+})
+
+test_that("convert array stream with explicit size works", {
+ stream0 <- arrow::RecordBatchReader$create(
+ schema = arrow::schema(x = arrow::int32())
+ )
+ stream0 <- as_nanoarrow_array_stream(stream0)
+ expect_identical(
+ convert_array_stream(stream0, size = 0),
+ data.frame(x = integer())
+ )
+
+ stream1 <- arrow::RecordBatchReader$create(
+ arrow::record_batch(x = 1:5)
+ )
+ stream1 <- as_nanoarrow_array_stream(stream1)
+ expect_identical(
+ convert_array_stream(stream1, size = 5),
+ data.frame(x = 1:5)
+ )
+
+ stream2 <- arrow::RecordBatchReader$create(
+ arrow::record_batch(x = 1:5),
+ arrow::record_batch(x = 6:10)
+ )
+ stream2 <- as_nanoarrow_array_stream(stream2)
+ expect_identical(
+ convert_array_stream(stream2, size = 10),
+ data.frame(x = 1:10)
+ )
+})
+
+test_that("convert array stream with functional ptype works", {
+ tibble_or_bust <- function(array, ptype) {
+ if (is.data.frame(ptype)) {
+ ptype <- tibble::as_tibble(ptype)
+ ptype[] <- Map(tibble_or_bust, list(NULL), ptype)
+ }
+
+ ptype
+ }
+
+ df_nested_df <- as.data.frame(
+ tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
+ )
+ stream_nested <- as_nanoarrow_array_stream(df_nested_df)
+ expect_identical(
+ convert_array_stream(stream_nested, tibble_or_bust),
+ tibble::tibble(a = 1L, b = "two", c = tibble::tibble(a = 3))
+ )
+})
+
+test_that("convert array stream works for nested data.frames", {
+ tbl_nested_df <- tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
+
+ stream_nested <- as_nanoarrow_array_stream(tbl_nested_df)
+ expect_identical(
+ convert_array_stream(stream_nested, tbl_nested_df),
+ tbl_nested_df
+ )
+
+ stream_nested <- as_nanoarrow_array_stream(tbl_nested_df)
+ expect_identical(
+ convert_array_stream(stream_nested, size = 1L),
+ as.data.frame(tbl_nested_df)
+ )
+
+ stream_nested <- as_nanoarrow_array_stream(tbl_nested_df)
+ expect_identical(
+ convert_array_stream(stream_nested, tbl_nested_df, size = 1L),
+ tbl_nested_df
+ )
+})
+
+test_that("convert array stream works for struct-style vectors", {
+ raw_posixlt <- as.data.frame(unclass(as.POSIXlt("2021-01-01", tz = "America/Halifax")))
+
+ stream <- as_nanoarrow_array_stream(raw_posixlt)
+ expect_identical(
+ convert_array_stream(stream),
+ raw_posixlt
+ )
+
+ stream <- as_nanoarrow_array_stream(raw_posixlt)
+ expect_identical(
+ convert_array_stream(stream, as.POSIXlt("2021-01-01", tz = "America/Halifax")),
+ as.POSIXlt("2021-01-01", tz = "America/Halifax")
+ )
+
+ # Check with fixed size since this takes a different code path
+ stream <- as_nanoarrow_array_stream(raw_posixlt)
+ expect_identical(
+ convert_array_stream(stream, size = 1L),
+ raw_posixlt
+ )
+
+ stream <- as_nanoarrow_array_stream(raw_posixlt)
+ expect_identical(
+ convert_array_stream(
+ stream,
+ as.POSIXlt("2021-01-01", tz = "America/Halifax"),
+ size = 1
+ ),
+ as.POSIXlt("2021-01-01", tz = "America/Halifax")
+ )
+})
+
+test_that("convert array stream respects the value of n", {
+ batches <- list(
+ arrow::record_batch(x = 1:5),
+ arrow::record_batch(x = 6:10),
+ arrow::record_batch(x = 11:15)
+ )
+
+ reader3 <- arrow::RecordBatchReader$create(batches = batches)
+ stream3 <- as_nanoarrow_array_stream(reader3)
+ expect_identical(
+ convert_array_stream(stream3, n = 0),
+ data.frame(x = integer())
+ )
+
+ reader3 <- arrow::RecordBatchReader$create(batches = batches)
+ stream3 <- as_nanoarrow_array_stream(reader3)
+ expect_identical(
+ convert_array_stream(stream3, n = 1),
+ data.frame(x = 1:5)
+ )
+
+ reader3 <- arrow::RecordBatchReader$create(batches = batches)
+ stream3 <- as_nanoarrow_array_stream(reader3)
+ expect_identical(
+ convert_array_stream(stream3, n = 2),
+ data.frame(x = 1:10)
+ )
+})
+
+test_that("fixed-size convert array stream respects the value of n", {
+ batches <- list(
+ arrow::record_batch(x = 1:5),
+ arrow::record_batch(x = 6:10),
+ arrow::record_batch(x = 11:15)
+ )
+
+ reader3 <- arrow::RecordBatchReader$create(batches = batches)
+ stream3 <- as_nanoarrow_array_stream(reader3)
+ expect_identical(
+ convert_array_stream(stream3, n = 0, size = 0),
+ data.frame(x = integer())
+ )
+
+ reader3 <- arrow::RecordBatchReader$create(batches = batches)
+ stream3 <- as_nanoarrow_array_stream(reader3)
+ expect_identical(
+ convert_array_stream(stream3, n = 1, size = 5),
+ data.frame(x = 1:5)
+ )
+
+ reader3 <- arrow::RecordBatchReader$create(batches = batches)
+ stream3 <- as_nanoarrow_array_stream(reader3)
+ expect_identical(
+ convert_array_stream(stream3, n = 2, size = 10),
+ data.frame(x = 1:10)
+ )
+})
+
+test_that("fixed-size stream conversion errors when the output has insufficient size", {
+ stream <- as_nanoarrow_array_stream(data.frame(x = 1:100))
+ expect_error(
+ convert_array_stream(stream, size = 2),
+ "Expected to materialize 100 values in batch 1 but materialized 2"
+ )
+})
diff --git a/r/tests/testthat/test-convert-array.R b/r/tests/testthat/test-convert-array.R
new file mode 100644
index 0000000..e6705a6
--- /dev/null
+++ b/r/tests/testthat/test-convert-array.R
@@ -0,0 +1,755 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("convert_array() errors for invalid arrays", {
+ array <- as_nanoarrow_array(1:10)
+ nanoarrow_array_set_schema(
+ array,
+ infer_nanoarrow_schema("chr"),
+ validate = FALSE
+ )
+
+ expect_error(
+ convert_array(array),
+ "Expected array with 3 buffer"
+ )
+})
+
+test_that("convert_array() errors for unsupported ptype", {
+ array <- as_nanoarrow_array(1:10)
+
+ # an S3 unsupported type
+ expect_error(
+ convert_array(array, structure(list(), class = "some_class")),
+ "Can't convert array <int32> to R vector of type some_class"
+ )
+
+ # A non-S3 unsupported type
+ expect_error(
+ convert_array(array, environment()),
+ "Can't convert array <int32> to R vector of type environment"
+ )
+
+ # An array with a name to an unsupported type
+ struct_array <- as_nanoarrow_array(data.frame(x = 1L))
+ expect_error(
+ convert_array(struct_array$children$x, environment()),
+ "Can't convert `x`"
+ )
+})
+
+test_that("convert_array() errors for unsupported array", {
+ unsupported_array <- arrow::concat_arrays(type = arrow::decimal256(3, 4))
+ expect_error(
+ convert_array(as_nanoarrow_array(unsupported_array)),
+ "Can't infer R vector type for array <decimal256\\(3, 4\\)>"
+ )
+})
+
+test_that("convert to vector works for data.frame", {
+ df <- data.frame(a = 1L, b = "two", c = 3, d = TRUE)
+ array <- as_nanoarrow_array(df)
+
+ expect_identical(convert_array(array, NULL), df)
+ expect_identical(convert_array(array, df), df)
+
+ expect_error(
+ convert_array(array, data.frame(a = integer(), b = raw())),
+ "Expected data.frame\\(\\) ptype with 4 column\\(s\\) but found 2 column\\(s\\)"
+ )
+
+ bad_ptype <- data.frame(a = integer(), b = raw(), c = double(), d = integer())
+ expect_error(
+ convert_array(array, bad_ptype),
+ "Can't convert `b` <string> to R vector of type raw"
+ )
+})
+
+test_that("convert to vector works for partial_frame", {
+ array <- as_nanoarrow_array(data.frame(a = 1L, b = "two"))
+ expect_identical(
+ convert_array(array, vctrs::partial_frame()),
+ data.frame(a = 1L, b = "two")
+ )
+})
+
+test_that("convert to vector works for function()", {
+ tibble_or_bust <- function(array, ptype) {
+ if (is.data.frame(ptype)) {
+ ptype <- tibble::as_tibble(ptype)
+ ptype[] <- Map(tibble_or_bust, list(NULL), ptype)
+ }
+
+ ptype
+ }
+
+ df_nested_df <- as.data.frame(
+ tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
+ )
+ array_nested <- as_nanoarrow_array(df_nested_df)
+ expect_identical(
+ convert_array(array_nested, tibble_or_bust),
+ tibble::tibble(a = 1L, b = "two", c = tibble::tibble(a = 3))
+ )
+})
+
+test_that("convert to vector works for tibble", {
+ array <- as_nanoarrow_array(data.frame(a = 1L, b = "two"))
+ expect_identical(
+ convert_array(array, tibble::tibble(a = integer(), b = character())),
+ tibble::tibble(a = 1L, b = "two")
+ )
+
+ # Check nested tibble at both levels
+ tbl_nested_df <- tibble::tibble(a = 1L, b = "two", c = data.frame(a = 3))
+ array_nested <- as_nanoarrow_array(tbl_nested_df)
+
+ expect_identical(
+ convert_array(array_nested, tbl_nested_df),
+ tbl_nested_df
+ )
+
+ df_nested_tbl <- as.data.frame(tbl_nested_df)
+ df_nested_tbl$c <- tibble::as_tibble(df_nested_tbl$c)
+ expect_identical(
+ convert_array(array_nested, df_nested_tbl),
+ df_nested_tbl
+ )
+})
+
+test_that("convert to vector works for struct-style vectors", {
+ array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax"))
+ expect_identical(
+ convert_array(array),
+ as.data.frame(unclass(as.POSIXlt("2021-01-01", tz = "America/Halifax")))
+ )
+
+ array <- as_nanoarrow_array(as.POSIXlt("2021-01-01", tz = "America/Halifax"))
+ expect_identical(
+ convert_array(array, as.POSIXlt("2021-01-01", tz = "America/Halifax")),
+ as.POSIXlt("2021-01-01", tz = "America/Halifax")
+ )
+})
+
+test_that("convert to vector works for unspecified()", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+
+ # implicit for null type
+ expect_identical(
+ convert_array(array, to = NULL),
+ vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
+ )
+
+ # explicit for null type
+ expect_identical(
+ convert_array(array, vctrs::unspecified()),
+ vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
+ )
+
+ # explicit for non-null type that is all NAs
+ array <- as_nanoarrow_array(rep(NA_integer_, 10))
+ expect_identical(
+ convert_array(array, vctrs::unspecified()),
+ vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
+ )
+
+ # explicit for non-null type that is not all NAs
+ array <- as_nanoarrow_array(c(1L, rep(NA_integer_, 9)))
+ expect_warning(
+ expect_identical(
+ convert_array(array, vctrs::unspecified()),
+ vctrs::vec_cast(rep(NA, 10), vctrs::unspecified())
+ ),
+ "1 non-null value\\(s\\) set to NA"
+ )
+})
+
+test_that("convert to vector works for valid logical()", {
+ arrow_numeric_types <- list(
+ int8 = arrow::int8(),
+ uint8 = arrow::uint8(),
+ int16 = arrow::int16(),
+ uint16 = arrow::uint16(),
+ int32 = arrow::int32(),
+ uint32 = arrow::uint32(),
+ int64 = arrow::int64(),
+ uint64 = arrow::uint64(),
+ float32 = arrow::float32(),
+ float64 = arrow::float64()
+ )
+
+ vals <- c(NA, 0:10)
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
+ logical()
+ ),
+ vals != 0
+ )
+ }
+
+ vals_no_na <- 0:10
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
+ logical()
+ ),
+ vals_no_na != 0
+ )
+ }
+
+ # Boolean array to logical
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
+ logical()
+ ),
+ c(NA, TRUE, FALSE)
+ )
+
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
+ logical()
+ ),
+ c(TRUE, FALSE)
+ )
+})
+
+test_that("convert to vector works for null -> logical()", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, logical()),
+ rep(NA, 10)
+ )
+})
+
+test_that("convert to vector errors for bad array to logical()", {
+ expect_error(
+ convert_array(as_nanoarrow_array(letters), logical()),
+ "Can't convert array <string> to R vector of type logical"
+ )
+})
+
+test_that("convert to vector works for valid integer()", {
+ arrow_int_types <- list(
+ int8 = arrow::int8(),
+ uint8 = arrow::uint8(),
+ int16 = arrow::int16(),
+ uint16 = arrow::uint16(),
+ int32 = arrow::int32(),
+ uint32 = arrow::uint32(),
+ int64 = arrow::int64(),
+ uint64 = arrow::uint64(),
+ float32 = arrow::float32(),
+ float64 = arrow::float64()
+ )
+
+ ints <- c(NA, 0:10)
+ for (nm in names(arrow_int_types)) {
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(ints, schema = arrow_int_types[[!!nm]]),
+ integer()
+ ),
+ ints
+ )
+ }
+
+ ints_no_na <- 0:10
+ for (nm in names(arrow_int_types)) {
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(ints_no_na, schema = arrow_int_types[[!!nm]]),
+ integer()
+ ),
+ ints_no_na
+ )
+ }
+
+ # Boolean array to integer
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
+ integer()
+ ),
+ c(NA, 1L, 0L)
+ )
+
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
+ integer()
+ ),
+ c(1L, 0L)
+ )
+})
+
+test_that("convert to vector works for null -> logical()", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, integer()),
+ rep(NA_integer_, 10)
+ )
+})
+
+test_that("convert to vector warns for invalid integer()", {
+ array <- as_nanoarrow_array(arrow::as_arrow_array(.Machine$double.xmax))
+ expect_warning(
+ expect_identical(convert_array(array, integer()), NA_integer_),
+ "1 value\\(s\\) outside integer range set to NA"
+ )
+
+ array <- as_nanoarrow_array(arrow::as_arrow_array(c(NA, .Machine$double.xmax)))
+ expect_warning(
+ expect_identical(convert_array(array, integer()), c(NA_integer_, NA_integer_)),
+ "1 value\\(s\\) outside integer range set to NA"
+ )
+})
+
+test_that("convert to vector errors for bad array to integer()", {
+ expect_error(
+ convert_array(as_nanoarrow_array(letters), integer()),
+ "Can't convert array <string> to R vector of type integer"
+ )
+})
+
+test_that("convert to vector works for valid double()", {
+ arrow_numeric_types <- list(
+ int8 = arrow::int8(),
+ uint8 = arrow::uint8(),
+ int16 = arrow::int16(),
+ uint16 = arrow::uint16(),
+ int32 = arrow::int32(),
+ uint32 = arrow::uint32(),
+ int64 = arrow::int64(),
+ uint64 = arrow::uint64(),
+ float32 = arrow::float32(),
+ float64 = arrow::float64()
+ )
+
+ vals <- as.double(c(NA, 0:10))
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(vals, schema = arrow_numeric_types[[!!nm]]),
+ double()
+ ),
+ vals
+ )
+ }
+
+ vals_no_na <- as.double(0:10)
+ for (nm in names(arrow_numeric_types)) {
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(vals_no_na, schema = arrow_numeric_types[[!!nm]]),
+ double()
+ ),
+ vals_no_na
+ )
+ }
+
+ # Boolean array to double
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(c(NA, TRUE, FALSE), schema = arrow::boolean()),
+ double()
+ ),
+ as.double(c(NA, 1L, 0L))
+ )
+
+ expect_identical(
+ convert_array(
+ as_nanoarrow_array(c(TRUE, FALSE), schema = arrow::boolean()),
+ double()
+ ),
+ as.double(c(1L, 0L))
+ )
+})
+
+test_that("convert to vector works for decimal128 -> double()", {
+ array <- as_nanoarrow_array(arrow::Array$create(1:10)$cast(arrow::decimal128(20, 10)))
+ expect_equal(
+ convert_array(array, double()),
+ as.double(1:10)
+ )
+})
+
+test_that("convert to vector works for null -> double()", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, double()),
+ rep(NA_real_, 10)
+ )
+})
+
+test_that("convert to vector errors for bad array to double()", {
+ expect_error(
+ convert_array(as_nanoarrow_array(letters), double()),
+ "Can't convert array <string> to R vector of type numeric"
+ )
+})
+
+test_that("convert to vector works for character()", {
+ array <- as_nanoarrow_array(letters)
+ expect_identical(
+ convert_array(array, character()),
+ letters
+ )
+
+ # make sure we get altrep here
+ expect_true(is_nanoarrow_altrep(convert_array(array, character())))
+
+ # check an array that we can't convert
+ expect_error(
+ convert_array(as_nanoarrow_array(1:5), character()),
+ "Can't convert array <int32> to R vector of type character"
+ )
+})
+
+test_that("convert to vector works for null -> character()", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ all_nulls <- convert_array(array, character())
+ nanoarrow_altrep_force_materialize(all_nulls)
+ expect_identical(
+ all_nulls,
+ rep(NA_character_, 10)
+ )
+})
+
+test_that("convert to vector works for blob::blob()", {
+ array <- as_nanoarrow_array(list(as.raw(1:5)), schema = arrow::binary())
+ expect_identical(
+ convert_array(array),
+ blob::blob(as.raw(1:5))
+ )
+
+ expect_identical(
+ convert_array(array, blob::blob()),
+ blob::blob(as.raw(1:5))
+ )
+})
+
+test_that("convert to vector works for null -> blob::blob()", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, blob::blob()),
+ blob::new_blob(rep(list(NULL), 10))
+ )
+})
+
+test_that("convert to vector works for list -> vctrs::list_of", {
+ array_list <- as_nanoarrow_array(
+ arrow::Array$create(
+ list(1:5, 6:10, NULL),
+ type = arrow::list_of(arrow::int32())
+ )
+ )
+
+ # Default conversion
+ expect_identical(
+ convert_array(array_list),
+ vctrs::list_of(1:5, 6:10, NULL, .ptype = integer())
+ )
+
+ # With explicit ptype
+ expect_identical(
+ convert_array(array_list, vctrs::list_of(.ptype = double())),
+ vctrs::list_of(as.double(1:5), as.double(6:10), NULL, .ptype = double())
+ )
+
+ # With bad ptype
+ expect_error(
+ convert_array(array_list, vctrs::list_of(.ptype = character())),
+ "Can't convert array"
+ )
+
+ # With malformed ptype
+ ptype <- vctrs::list_of(.ptype = character())
+ attr(ptype, "ptype") <- NULL
+ expect_error(
+ convert_array(array_list, ptype),
+ "Expected attribute 'ptype'"
+ )
+})
+
+test_that("convert to vector works for large_list -> vctrs::list_of", {
+ array_list <- as_nanoarrow_array(
+ arrow::Array$create(
+ list(1:5, 6:10, NULL),
+ type = arrow::large_list_of(arrow::int32())
+ )
+ )
+
+ # Default conversion
+ expect_identical(
+ convert_array(array_list),
+ vctrs::list_of(1:5, 6:10, NULL, .ptype = integer())
+ )
+
+ # With explicit ptype
+ expect_identical(
+ convert_array(array_list, vctrs::list_of(.ptype = double())),
+ vctrs::list_of(as.double(1:5), as.double(6:10), NULL, .ptype = double())
+ )
+
+ # With bad ptype
+ expect_error(
+ convert_array(array_list, vctrs::list_of(.ptype = character())),
+ "Can't convert array"
+ )
+})
+
+test_that("convert to vector works for fixed_size_list -> vctrs::list_of", {
+ array_list <- as_nanoarrow_array(
+ arrow::Array$create(
+ list(1:5, 6:10, NULL),
+ type = arrow::fixed_size_list_of(arrow::int32(), 5)
+ )
+ )
+
+ # Default conversion
+ expect_identical(
+ convert_array(array_list),
+ vctrs::list_of(1:5, 6:10, NULL, .ptype = integer())
+ )
+
+ # With explicit ptype
+ expect_identical(
+ convert_array(array_list, vctrs::list_of(.ptype = double())),
+ vctrs::list_of(as.double(1:5), as.double(6:10), NULL, .ptype = double())
+ )
+
+ # With bad ptype
+ expect_error(
+ convert_array(array_list, vctrs::list_of(.ptype = character())),
+ "Can't convert array"
+ )
+})
+
+test_that("convert to vector works for null -> vctrs::list_of()", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, vctrs::list_of(.ptype = integer())),
+ vctrs::new_list_of(rep(list(NULL), 10), ptype = integer())
+ )
+})
+
+test_that("convert to vector works for Date", {
+ array_date <- as_nanoarrow_array(as.Date(c(NA, "2000-01-01")))
+ expect_identical(
+ convert_array(array_date),
+ as.Date(c(NA, "2000-01-01"))
+ )
+
+ array_date <- as_nanoarrow_array(
+ arrow::Array$create(as.Date(c(NA, "2000-01-01")), arrow::date64())
+ )
+ expect_identical(
+ convert_array(array_date),
+ as.POSIXct(c(NA, "2000-01-01"), tz = "UTC")
+ )
+})
+
+test_that("convert to vector works for null -> Date", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, as.Date(character())),
+ as.Date(rep(NA_character_, 10))
+ )
+})
+
+test_that("convert to vector works for hms", {
+ array_time <- as_nanoarrow_array(hms::parse_hm("12:34"))
+ expect_identical(
+ convert_array(array_time),
+ hms::parse_hm("12:34")
+ )
+})
+
+test_that("convert to vector works for null -> hms", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, hms::hms()),
+ hms::parse_hms(rep(NA_character_, 10))
+ )
+})
+
+test_that("convert to vector works for POSIXct", {
+ array_timestamp <- as_nanoarrow_array(
+ as.POSIXct("2000-01-01 12:33", tz = "America/Halifax")
+ )
+
+ expect_identical(
+ convert_array(array_timestamp),
+ as.POSIXct("2000-01-01 12:33", tz = "America/Halifax")
+ )
+})
+
+test_that("convert to vector works for null -> POSIXct", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, as.POSIXct(character(), tz = "America/Halifax")),
+ as.POSIXct(rep(NA_character_, 10), tz = "America/Halifax")
+ )
+})
+
+test_that("convert to vector works for difftime", {
+ x <- as.difftime(123, units = "secs")
+ array_duration <- as_nanoarrow_array(x)
+
+ # default
+ expect_identical(convert_array(array_duration), x)
+
+ # explicit
+ expect_identical(convert_array(array_duration, x), x)
+
+ # explicit with other difftime units
+ units(x) <- "mins"
+ expect_identical(convert_array(array_duration, x), x)
+
+ units(x) <- "hours"
+ expect_identical(convert_array(array_duration, x), x)
+
+ units(x) <- "days"
+ expect_identical(convert_array(array_duration, x), x)
+
+ units(x) <- "weeks"
+ expect_equal(convert_array(array_duration, x), x)
+
+ # with all Arrow units
+ x <- as.difftime(123, units = "secs")
+ array_duration <- as_nanoarrow_array(
+ arrow::Array$create(x, arrow::duration("s"))
+ )
+ expect_identical(convert_array(array_duration), x)
+
+ array_duration <- as_nanoarrow_array(
+ arrow::Array$create(x, arrow::duration("ms"))
+ )
+ expect_identical(convert_array(array_duration), x)
+
+ array_duration <- as_nanoarrow_array(
+ arrow::Array$create(x, arrow::duration("us"))
+ )
+ expect_identical(convert_array(array_duration), x)
+
+ array_duration <- as_nanoarrow_array(
+ arrow::Array$create(x, arrow::duration("ns"))
+ )
+ expect_equal(convert_array(array_duration), x)
+
+ # bad ptype values
+ attr(x, "units") <- NULL
+ expect_error(
+ convert_array(array_duration, x),
+ "Expected difftime 'units' attribute of type"
+ )
+
+ attr(x, "units") <- character()
+ expect_error(
+ convert_array(array_duration, x),
+ "Expected difftime 'units' attribute of type"
+ )
+
+ attr(x, "units") <- integer(1)
+ expect_error(
+ convert_array(array_duration, x),
+ "Expected difftime 'units' attribute of type"
+ )
+
+ attr(x, "units") <- "gazornenplat"
+ expect_error(
+ convert_array(array_duration, x),
+ "Unexpected value for difftime 'units' attribute"
+ )
+
+ attr(x, "units") <- NA_character_
+ expect_error(
+ convert_array(array_duration, x),
+ "Unexpected value for difftime 'units' attribute"
+ )
+})
+
+test_that("convert to vector works for null -> difftime", {
+ array <- as_nanoarrow_array(arrow::Array$create(rep(NA, 10), arrow::null()))
+ expect_identical(
+ convert_array(array, as.difftime(numeric(), units = "secs")),
+ as.difftime(rep(NA_real_, 10), units = "secs")
+ )
+})
+
+test_that("convert to vector works for data frames nested inside lists", {
+ df_in_list <- vctrs::list_of(
+ data.frame(x = 1:5),
+ data.frame(x = 6:10),
+ data.frame(x = 11:15)
+ )
+
+ nested_array <- as_nanoarrow_array(df_in_list)
+ expect_identical(
+ convert_array(nested_array),
+ df_in_list
+ )
+})
+
+test_that("convert to vector works for lists nested in data frames", {
+ df_in_list_in_df <- data.frame(
+ x = vctrs::list_of(
+ data.frame(x = 1:5),
+ data.frame(x = 6:10),
+ data.frame(x = 11:15)
+ )
+ )
+
+ nested_array <- as_nanoarrow_array(df_in_list_in_df)
+ expect_identical(
+ convert_array(nested_array),
+ df_in_list_in_df
+ )
+})
+
+test_that("convert to vector warns for stripped extension type", {
+ ext_arr <- as_nanoarrow_array(
+ arrow::Array$create(vctrs::new_vctr(1:5, class = "my_vctr"))
+ )
+ expect_warning(
+ expect_identical(convert_array(ext_arr), 1:5),
+ "Converting unknown extension arrow.r.vctrs"
+ )
+
+ nested_ext_array <- as_nanoarrow_array(
+ arrow::record_batch(
+ x = vctrs::new_vctr(1:5, class = "my_vctr")
+ )
+ )
+ expect_warning(
+ expect_identical(convert_array(nested_ext_array), data.frame(x = 1:5)),
+ "x: Converting unknown extension arrow.r.vctrs"
+ )
+})
+
+test_that("convert to vector errors for dictionary types", {
+ dict_array <- as_nanoarrow_array(factor(letters[1:5]))
+ expect_error(
+ convert_array(dict_array, character()),
+ "Conversion to dictionary-encoded array is not supported"
+ )
+})
diff --git a/r/tests/testthat/test-infer-ptype.R b/r/tests/testthat/test-infer-ptype.R
new file mode 100644
index 0000000..76ed25b
--- /dev/null
+++ b/r/tests/testthat/test-infer-ptype.R
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("infer_nanoarrow_ptype() works on arrays, schemas, and streams", {
+ array <- as_nanoarrow_array(logical())
+ expect_identical(infer_nanoarrow_ptype(array), logical())
+
+ schema <- infer_nanoarrow_schema(array)
+ expect_identical(infer_nanoarrow_ptype(schema), logical())
+
+ stream <- as_nanoarrow_array_stream(data.frame(x = logical()))
+ expect_identical(infer_nanoarrow_ptype(stream), data.frame(x = logical()))
+
+ expect_error(
+ infer_nanoarrow_ptype("not valid"),
+ "must be a nanoarrow_schema"
+ )
+})
+
+test_that("infer_nanoarrow_ptype() works for basic types", {
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(vctrs::unspecified())),
+ vctrs::unspecified()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(logical())),
+ logical()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(integer())),
+ integer()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(double())),
+ double()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_schema(arrow::decimal128(2, 3))),
+ double()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(character())),
+ character()
+ )
+
+ expect_identical(
+ infer_nanoarrow_ptype(as_nanoarrow_array(data.frame(x = character()))),
+ data.frame(x = character())
+ )
+})
+
+test_that("infer_nanoarrow_ptype() infers ptypes for date/time types", {
+ array_date <- as_nanoarrow_array(as.Date("2000-01-01"))
+ expect_identical(
+ infer_nanoarrow_ptype(array_date),
+ as.Date(character())
+ )
+
+ array_time <- as_nanoarrow_array(hms::parse_hm("12:34"))
+ expect_identical(
+ infer_nanoarrow_ptype(array_time),
+ hms::hms()
+ )
+
+ array_duration <- as_nanoarrow_array(as.difftime(123, units = "secs"))
+ expect_identical(
+ infer_nanoarrow_ptype(array_duration),
+ as.difftime(numeric(), units = "secs")
+ )
+
+ array_timestamp <- as_nanoarrow_array(
+ as.POSIXct("2000-01-01 12:33", tz = "America/Halifax")
+ )
+ expect_identical(
+ infer_nanoarrow_ptype(array_timestamp),
+ as.POSIXct(character(), tz = "America/Halifax")
+ )
+})
+
+test_that("infer_nanoarrow_ptype() infers ptypes for nested types", {
+ array_list <- as_nanoarrow_array(vctrs::list_of(integer()))
+ expect_identical(
+ infer_nanoarrow_ptype(array_list),
+ vctrs::list_of(.ptype = integer())
+ )
+
+ array_fixed_size <- as_nanoarrow_array(
+ arrow::Array$create(
+ list(1:5),
+ arrow::fixed_size_list_of(arrow::int32(), 5)
+ )
+ )
+ expect_identical(
+ infer_nanoarrow_ptype(array_fixed_size),
+ vctrs::list_of(.ptype = integer())
+ )
+})
+
+test_that("infer_nanoarrow_ptype() errors for types it can't infer", {
+ unsupported_array <- arrow::concat_arrays(type = arrow::decimal256(3, 4))
+ expect_error(
+ infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_array)),
+ "Can't infer R vector type for array <decimal256\\(3, 4\\)>"
+ )
+
+ unsupported_struct <- arrow::concat_arrays(
+ type = arrow::struct(col = arrow::decimal256(3, 4))
+ )
+ expect_error(
+ infer_nanoarrow_ptype(as_nanoarrow_array(unsupported_struct)),
+ "Can't infer R vector type for `col` <decimal256\\(3, 4\\)>"
+ )
+})
diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R
index 1c1c878..65d21a6 100644
--- a/r/tests/testthat/test-schema.R
+++ b/r/tests/testthat/test-schema.R
@@ -39,6 +39,42 @@ test_that("infer_nanoarrow_schema() default method works", {
expect_true(arrow::as_data_type(schema)$Equals(arrow::int32()))
})
+test_that("nanoarrow_schema_parse() works", {
+ simple_info <- nanoarrow_schema_parse(arrow::int32())
+ expect_identical(simple_info$type, "int32")
+ expect_identical(simple_info$storage_type, "int32")
+
+ fixed_size_info <- nanoarrow_schema_parse(arrow::fixed_size_binary(1234))
+ expect_identical(fixed_size_info$fixed_size, 1234L)
+
+ decimal_info <- nanoarrow_schema_parse(arrow::decimal128(4, 5))
+ expect_identical(decimal_info$decimal_bitwidth, 128L)
+ expect_identical(decimal_info$decimal_precision, 4L)
+ expect_identical(decimal_info$decimal_scale, 5L)
+
+ time_unit_info <- nanoarrow_schema_parse(arrow::time32("s"))
+ expect_identical(time_unit_info$time_unit, "s")
+
+ timezone_info <- nanoarrow_schema_parse(arrow::timestamp("s", "America/Halifax"))
+ expect_identical(timezone_info$timezone, "America/Halifax")
+
+ recursive_info <- nanoarrow_schema_parse(
+ infer_nanoarrow_schema(data.frame(x = 1L)),
+ recursive = FALSE
+ )
+ expect_null(recursive_info$children)
+
+ recursive_info <- nanoarrow_schema_parse(
+ infer_nanoarrow_schema(data.frame(x = 1L)),
+ recursive = TRUE
+ )
+ expect_length(recursive_info$children, 1L)
+ expect_identical(
+ recursive_info$children$x,
+ nanoarrow_schema_parse(infer_nanoarrow_schema(1L))
+ )
+})
+
test_that("schema list interface works for non-nested types", {
schema <- infer_nanoarrow_schema(1:10)
expect_identical(length(schema), 6L)