You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/04/08 15:52:21 UTC
[arrow] branch master updated: ARROW-15471: [R] ExtensionType support in R
This is an automated email from the ASF dual-hosted git repository.
jonkeane pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 489aada557 ARROW-15471: [R] ExtensionType support in R
489aada557 is described below
commit 489aada557f267f4b9745b039034e9f5b0e1f485
Author: Dewey Dunnington <de...@fishandwhistle.net>
AuthorDate: Fri Apr 8 10:52:12 2022 -0500
ARROW-15471: [R] ExtensionType support in R
This PR implements extension type support and registration in the R bindings (as has been possible in the Python bindings for some time). The details still need to be worked out, but we at least have a working pattern:
``` r
library(arrow, warn.conflicts = FALSE)
library(R6)
SomeExtensionTypeSubclass <- R6Class(
"SomeExtensionTypeSubclass", inherit = arrow:::ExtensionType,
public = list(
some_custom_method = function() {
private$some_custom_field
},
.Deserialize = function(storage_type, extension_name, extension_metadata) {
private$some_custom_field <- head(extension_metadata, 5)
}
),
private = list(
some_custom_field = NULL
)
)
SomeExtensionArraySubclass <- R6Class(
"SomeExtensionArraySubclass", inherit = arrow:::ExtensionArray,
public = list(
some_custom_method = function() {
self$type$some_custom_method()
}
)
)
type <- arrow:::MakeExtensionType(
int32(),
"some_extension_subclass",
charToRaw("some custom metadata"),
type_class = SomeExtensionTypeSubclass,
array_class = SomeExtensionArraySubclass
)
arrow:::RegisterExtensionType(type)
# survives the C API round trip
ptr_type <- arrow:::allocate_arrow_schema()
type$export_to_c(ptr_type)
type2 <- arrow:::DataType$import_from_c(ptr_type)
type2
#> SomeExtensionTypeSubclass
#> SomeExtensionTypeSubclass <some custom metadata>
type2$some_custom_method()
#> [1] 73 6f 6d 65 20
(array <- type$WrapArray(Array$create(1:10)))
#> SomeExtensionArraySubclass
#> <SomeExtensionTypeSubclass <some custom metadata>>
#> [
#> 1,
#> 2,
#> 3,
#> 4,
#> 5,
#> 6,
#> 7,
#> 8,
#> 9,
#> 10
#> ]
array$some_custom_method()
#> [1] 73 6f 6d 65 20
ptr_array <- arrow:::allocate_arrow_array()
array$export_to_c(ptr_array, ptr_type)
(array2 <- Array$import_from_c(ptr_array, ptr_type))
#> SomeExtensionArraySubclass
#> <SomeExtensionTypeSubclass <some custom metadata>>
#> [
#> 1,
#> 2,
#> 3,
#> 4,
#> 5,
#> 6,
#> 7,
#> 8,
#> 9,
#> 10
#> ]
arrow:::delete_arrow_schema(ptr_type)
arrow:::delete_arrow_array(ptr_array)
```
<sup>Created on 2022-02-18 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>
Closes #12467 from paleolimbot/r-extension-type
Authored-by: Dewey Dunnington <de...@fishandwhistle.net>
Signed-off-by: Jonathan Keane <jk...@gmail.com>
---
r/DESCRIPTION | 1 +
r/NAMESPACE | 9 +
r/R/arrow-package.R | 5 +
r/R/arrowExports.R | 36 +++
r/R/extension.R | 545 +++++++++++++++++++++++++++++++++++
r/_pkgdown.yml | 5 +
r/man/ExtensionArray.Rd | 23 ++
r/man/ExtensionType.Rd | 48 +++
r/man/new_extension_type.Rd | 167 +++++++++++
r/man/vctrs_extension_array.Rd | 50 ++++
r/src/array.cpp | 2 +
r/src/array_to_vector.cpp | 33 +++
r/src/arrowExports.cpp | 150 ++++++++++
r/src/datatype.cpp | 2 +
r/src/extension-impl.cpp | 198 +++++++++++++
r/src/extension.h | 75 +++++
r/tests/testthat/_snaps/extension.md | 10 +
r/tests/testthat/test-extension.R | 345 ++++++++++++++++++++++
18 files changed, 1704 insertions(+)
diff --git a/r/DESCRIPTION b/r/DESCRIPTION
index 36a55c05b2..a5fb1ee9a4 100644
--- a/r/DESCRIPTION
+++ b/r/DESCRIPTION
@@ -108,6 +108,7 @@ Collate:
'table.R'
'dplyr.R'
'duckdb.R'
+ 'extension.R'
'feather.R'
'field.R'
'filesystem.R'
diff --git a/r/NAMESPACE b/r/NAMESPACE
index f32e73f537..da43a3f511 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -134,6 +134,8 @@ export(DictionaryArray)
export(DirectoryPartitioning)
export(DirectoryPartitioningFactory)
export(Expression)
+export(ExtensionArray)
+export(ExtensionType)
export(FeatherReader)
export(Field)
export(FileFormat)
@@ -267,6 +269,8 @@ export(match_arrow)
export(matches)
export(mmap_create)
export(mmap_open)
+export(new_extension_array)
+export(new_extension_type)
export(null)
export(num_range)
export(one_of)
@@ -282,6 +286,8 @@ export(read_parquet)
export(read_schema)
export(read_tsv_arrow)
export(record_batch)
+export(register_extension_type)
+export(reregister_extension_type)
export(s3_bucket)
export(schema)
export(set_cpu_count)
@@ -300,8 +306,11 @@ export(uint32)
export(uint64)
export(uint8)
export(unify_schemas)
+export(unregister_extension_type)
export(utf8)
export(value_counts)
+export(vctrs_extension_array)
+export(vctrs_extension_type)
export(write_arrow)
export(write_csv_arrow)
export(write_dataset)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 256bc7aefa..896363a478 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -80,6 +80,11 @@
}
}
+ if (arrow_available()) {
+ # register extension types that we use internally
+ reregister_extension_type(vctrs_extension_type(vctrs::unspecified()))
+ }
+
invisible()
}
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 5ef6312196..7bf77f1e66 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1068,6 +1068,42 @@ compute___expr__type_id <- function(x, schema) {
.Call(`_arrow_compute___expr__type_id`, x, schema)
}
+ExtensionType__initialize <- function(storage_type, extension_name, extension_metadata, r6_class) {
+ .Call(`_arrow_ExtensionType__initialize`, storage_type, extension_name, extension_metadata, r6_class)
+}
+
+ExtensionType__extension_name <- function(type) {
+ .Call(`_arrow_ExtensionType__extension_name`, type)
+}
+
+ExtensionType__Serialize <- function(type) {
+ .Call(`_arrow_ExtensionType__Serialize`, type)
+}
+
+ExtensionType__storage_type <- function(type) {
+ .Call(`_arrow_ExtensionType__storage_type`, type)
+}
+
+ExtensionType__MakeArray <- function(type, data) {
+ .Call(`_arrow_ExtensionType__MakeArray`, type, data)
+}
+
+ExtensionType__r6_class <- function(type) {
+ .Call(`_arrow_ExtensionType__r6_class`, type)
+}
+
+ExtensionArray__storage <- function(array) {
+ .Call(`_arrow_ExtensionArray__storage`, array)
+}
+
+arrow__RegisterRExtensionType <- function(type) {
+ invisible(.Call(`_arrow_arrow__RegisterRExtensionType`, type))
+}
+
+arrow__UnregisterRExtensionType <- function(type_name) {
+ invisible(.Call(`_arrow_arrow__UnregisterRExtensionType`, type_name))
+}
+
ipc___WriteFeather__Table <- function(stream, table, version, chunk_size, compression, compression_level) {
invisible(.Call(`_arrow_ipc___WriteFeather__Table`, stream, table, version, chunk_size, compression, compression_level))
}
diff --git a/r/R/extension.R b/r/R/extension.R
new file mode 100644
index 0000000000..111a0e8620
--- /dev/null
+++ b/r/R/extension.R
@@ -0,0 +1,545 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' @include arrow-package.R
+
+
+#' @title class arrow::ExtensionArray
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' The `ExtensionArray` class inherits from `Array`, but also provides
+#' access to the underlying storage of the extension.
+#'
+#' - `$storage()`: Returns the underlying [Array] used to store
+#' values.
+#'
+#' The `ExtensionArray` is not intended to be subclassed for extension
+#' types.
+#'
+#' @rdname ExtensionArray
+#' @name ExtensionArray
+#' @export
+ExtensionArray <- R6Class("ExtensionArray",
+ inherit = Array,
+ public = list(
+ storage = function() {
+ ExtensionArray__storage(self)
+ },
+
+ as_vector = function() {
+ self$type$as_vector(self)
+ }
+ )
+)
+
+ExtensionArray$create <- function(x, type) {
+ assert_is(type, "ExtensionType")
+ if (inherits(x, "ExtensionArray") && type$Equals(x$type)) {
+ return(x)
+ }
+
+ storage <- Array$create(x, type = type$storage_type())
+ type$WrapArray(storage)
+}
+
+#' @title class arrow::ExtensionType
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' The `ExtensionType` class inherits from `DataType`, but also defines
+#' extra methods specific to extension types:
+#'
+#' - `$storage_type()`: Returns the underlying [DataType] used to store
+#' values.
+#' - `$storage_id()`: Returns the [Type] identifier corresponding to the
+#' `$storage_type()`.
+#' - `$extension_name()`: Returns the extension name.
+#' - `$extension_metadata()`: Returns the serialized version of the extension
+#' metadata as a [raw()] vector.
+#' - `$extension_metadata_utf8()`: Returns the serialized version of the
+#' extension metadata as a UTF-8 encoded string.
+#' - `$WrapArray(array)`: Wraps a storage [Array] into an [ExtensionArray]
+#' with this extension type.
+#'
+#' In addition, subclasses may override the following methos to customize
+#' the behaviour of extension classes.
+#'
+#' - `$deserialize_instance()`: This method is called when a new [ExtensionType]
+#' is initialized and is responsible for parsing and validating
+#' the serialized extension_metadata (a [raw()] vector)
+#' such that its contents can be inspected by fields and/or methods
+#' of the R6 ExtensionType subclass. Implementations must also check the
+#' `storage_type` to make sure it is compatible with the extension type.
+#' - `$as_vector(extension_array)`: Convert an [Array] or [ChunkedArray] to an R
+#' vector. This method is called by [as.vector()] on [ExtensionArray]
+#' objects, when a [RecordBatch] containing an [ExtensionArray] is
+#' converted to a [data.frame()], or when a [ChunkedArray] (e.g., a column
+#' in a [Table]) is converted to an R vector. The default method returns the
+#' converted storage array.
+#' - `$ToString()` Return a string representation that will be printed
+#' to the console when this type or an Array of this type is printed.
+#'
+#' @rdname ExtensionType
+#' @name ExtensionType
+#' @export
+ExtensionType <- R6Class("ExtensionType",
+ inherit = DataType,
+ public = list(
+
+ # In addition to the initialization that occurs for all
+ # ArrowObject instances, we call deserialize_instance(), which can
+ # be overridden to populate custom fields
+ initialize = function(xp) {
+ super$initialize(xp)
+ self$deserialize_instance()
+ },
+
+ # Because of how C++ shared_ptr<> objects are converted to R objects,
+ # the initial object that is instantiated will be of this class
+ # (ExtensionType), but the R6Class object that was registered is
+ # available from C++. We need this in order to produce the correct
+ # R6 subclass when a shared_ptr<ExtensionType> is returned to R.
+ r6_class = function() {
+ ExtensionType__r6_class(self)
+ },
+
+ storage_type = function() {
+ ExtensionType__storage_type(self)
+ },
+
+ storage_id = function() {
+ self$storage_type()$id
+ },
+
+ extension_name = function() {
+ ExtensionType__extension_name(self)
+ },
+
+ extension_metadata = function() {
+ ExtensionType__Serialize(self)
+ },
+
+ # To make sure this conversion is done properly
+ extension_metadata_utf8 = function() {
+ metadata_utf8 <- rawToChar(self$extension_metadata())
+ Encoding(metadata_utf8) <- "UTF-8"
+ metadata_utf8
+ },
+
+ WrapArray = function(array) {
+ assert_is(array, "Array")
+ ExtensionType__MakeArray(self, array$data())
+ },
+
+ deserialize_instance = function() {
+ # Do nothing by default but allow other classes to override this method
+ # to populate R6 class members.
+ },
+
+ ExtensionEquals = function(other) {
+ inherits(other, "ExtensionType") &&
+ identical(other$extension_name(), self$extension_name()) &&
+ identical(other$extension_metadata(), self$extension_metadata())
+ },
+
+ as_vector = function(extension_array) {
+ if (inherits(extension_array, "ChunkedArray")) {
+ # Converting one array at a time so that users don't have to remember
+ # to implement two methods. Converting all the storage arrays to
+ # a ChunkedArray and then converting is probably faster
+ # (VctrsExtensionType does this).
+ storage_vectors <- lapply(
+ seq_len(extension_array$num_chunks) - 1L,
+ function(i) self$as_vector(extension_array$chunk(i))
+ )
+
+ vctrs::vec_c(!!! storage_vectors)
+ } else if (inherits(extension_array, "ExtensionArray")) {
+ extension_array$storage()$as_vector()
+ } else {
+ classes <- paste(class(extension_array), collapse = " / ")
+ abort(
+ c(
+ "`extension_array` must be a ChunkedArray or ExtensionArray",
+ i = glue::glue("Got object of type {classes}")
+ )
+ )
+ }
+ },
+
+ ToString = function() {
+ # metadata is probably valid UTF-8 (e.g., JSON), but might not be
+ # and it's confusing to error when printing the object. This herustic
+ # isn't perfect (but subclasses should override this method anyway)
+ metadata_raw <- self$extension_metadata()
+
+ if (as.raw(0x00) %in% metadata_raw) {
+ if (length(metadata_raw) > 20) {
+ sprintf(
+ "<%s %s...>",
+ class(self)[1],
+ paste(format(utils::head(metadata_raw, 20)), collapse = " ")
+ )
+ } else {
+ sprintf(
+ "<%s %s>",
+ class(self)[1],
+ paste(format(metadata_raw), collapse = " ")
+ )
+ }
+
+ } else {
+ paste0(class(self)[1], " <", self$extension_metadata_utf8(), ">")
+ }
+ }
+ )
+)
+
+# ExtensionType$new() is what gets used by the generated wrapper code to
+# create an R6 object when a shared_ptr<DataType> is returned to R and
+# that object has type_id() EXTENSION_TYPE. Rather than add complexity
+# to the wrapper code, we modify ExtensionType$new() to do what we need
+# it to do here (which is to return an instance of a custom R6
+# type whose .deserialize_instance method is called to populate custom fields).
+ExtensionType$.default_new <- ExtensionType$new
+ExtensionType$new <- function(xp) {
+ super <- ExtensionType$.default_new(xp)
+ r6_class <- super$r6_class()
+ if (identical(r6_class$classname, "ExtensionType")) {
+ super
+ } else {
+ r6_class$new(xp)
+ }
+}
+
+ExtensionType$create <- function(storage_type,
+ extension_name,
+ extension_metadata = raw(),
+ type_class = ExtensionType) {
+ if (is.string(extension_metadata)) {
+ extension_metadata <- charToRaw(enc2utf8(extension_metadata))
+ }
+
+ assert_that(is.string(extension_name), is.raw(extension_metadata))
+ assert_is(storage_type, "DataType")
+ assert_is(type_class, "R6ClassGenerator")
+
+ ExtensionType__initialize(
+ storage_type,
+ extension_name,
+ extension_metadata,
+ type_class
+ )
+}
+
+#' Extension types
+#'
+#' Extension arrays are wrappers around regular Arrow [Array] objects
+#' that provide some customized behaviour and/or storage. A common use-case
+#' for extension types is to define a customized conversion between an
+#' an Arrow [Array] and an R object when the default conversion is slow
+#' or looses metadata important to the interpretation of values in the array.
+#' For most types, the built-in
+#' [vctrs extension type][vctrs_extension_type] is probably sufficient.
+#'
+#' These functions create, register, and unregister [ExtensionType]
+#' and [ExtensionArray] objects. To use an extension type you will have to:
+#'
+#' - Define an [R6::R6Class] that inherits from [ExtensionType] and reimplement
+#' one or more methods (e.g., `deserialize_instance()`).
+#' - Make a type constructor function (e.g., `my_extension_type()`) that calls
+#' [new_extension_type()] to create an R6 instance that can be used as a
+#' [data type][data-type] elsewhere in the package.
+#' - Make an array constructor function (e.g., `my_extension_array()`) that
+#' calls [new_extension_array()] to create an [Array] instance of your
+#' extension type.
+#' - Register a dummy instance of your extension type created using
+#' you constructor function using [register_extension_type()].
+#'
+#' If defining an extension type in an R package, you will probably want to
+#' use [reregister_extension_type()] in that package's [.onLoad()] hook
+#' since your package will probably get reloaded in the same R session
+#' during its development and [register_extension_type()] will error if
+#' called twice for the same `extension_name`. For an example of an
+#' extension type that uses most of these features, see
+#' [vctrs_extension_type()].
+#'
+#' @param storage_type The [data type][data-type] of the underlying storage
+#' array.
+#' @param storage_array An [Array] object of the underlying storage.
+#' @param extension_type An [ExtensionType] instance.
+#' @param extension_name The extension name. This should be namespaced using
+#' "dot" syntax (i.e., "some_package.some_type"). The namespace "arrow"
+#' is reserved for extension types defined by the Apache Arrow libraries.
+#' @param extension_metadata A [raw()] or [character()] vector containing the
+#' serialized version of the type. Chatacter vectors must be length 1 and
+#' are converted to UTF-8 before converting to [raw()].
+#' @param type_class An [R6::R6Class] whose `$new()` class method will be
+#' used to construct a new instance of the type.
+#'
+#' @return
+#' - `new_extension_type()` returns an [ExtensionType] instance according
+#' to the `type_class` specified.
+#' - `new_extension_array()` returns an [ExtensionArray] whose `$type`
+#' corresponds to `extension_type`.
+#' - `register_extension_type()`, `unregister_extension_type()`
+#' and `reregister_extension_type()` return `NULL`, invisibly.
+#' @export
+#'
+#' @examplesIf arrow_available()
+#' # Create the R6 type whose methods control how Array objects are
+#' # converted to R objects, how equality between types is computed,
+#' # and how types are printed.
+#' QuantizedType <- R6::R6Class(
+#' "QuantizedType",
+#' inherit = ExtensionType,
+#' public = list(
+#' # methods to access the custom metadata fields
+#' center = function() private$.center,
+#' scale = function() private$.scale,
+#'
+#' # called when an Array of this type is converted to an R vector
+#' as_vector = function(extension_array) {
+#' if (inherits(extension_array, "ExtensionArray")) {
+#' unquantized_arrow <-
+#' (extension_array$storage()$cast(float64()) / private$.scale) +
+#' private$.center
+#'
+#' as.vector(unquantized_arrow)
+#' } else {
+#' super$as_vector(extension_array)
+#' }
+#' },
+#'
+#' # populate the custom metadata fields from the serialized metadata
+#' deserialize_instance = function() {
+#' vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]])
+#' private$.center <- vals[1]
+#' private$.scale <- vals[2]
+#' }
+#' ),
+#'
+#' private = list(
+#' .center = NULL,
+#' .scale = NULL
+#' )
+#' )
+#'
+#' # Create a helper type constructor that calls new_extension_type()
+#' quantized <- function(center = 0, scale = 1, storage_type = int32()) {
+#' new_extension_type(
+#' storage_type = storage_type,
+#' extension_name = "arrow.example.quantized",
+#' extension_metadata = paste(center, scale, sep = ";"),
+#' type_class = QuantizedType
+#' )
+#' }
+#'
+#' # Create a helper array constructor that calls new_extension_array()
+#' quantized_array <- function(x, center = 0, scale = 1,
+#' storage_type = int32()) {
+#' type <- quantized(center, scale, storage_type)
+#' new_extension_array(
+#' Array$create((x - center) * scale, type = storage_type),
+#' type
+#' )
+#' }
+#'
+#' # Register the extension type so that Arrow knows what to do when
+#' # it encounters this extension type
+#' reregister_extension_type(quantized())
+#'
+#' # Create Array objects and use them!
+#' (vals <- runif(5, min = 19, max = 21))
+#'
+#' (array <- quantized_array(
+#' vals,
+#' center = 20,
+#' scale = 2 ^ 15 - 1,
+#' storage_type = int16())
+#' )
+#'
+#' array$type$center()
+#' array$type$scale()
+#'
+#' as.vector(array)
+new_extension_type <- function(storage_type,
+ extension_name,
+ extension_metadata = raw(),
+ type_class = ExtensionType) {
+ ExtensionType$create(
+ storage_type,
+ extension_name,
+ extension_metadata,
+ type_class
+ )
+}
+
+#' @rdname new_extension_type
+#' @export
+new_extension_array <- function(storage_array, extension_type) {
+ ExtensionArray$create(storage_array, extension_type)
+}
+
+#' @rdname new_extension_type
+#' @export
+register_extension_type <- function(extension_type) {
+ assert_is(extension_type, "ExtensionType")
+ arrow__RegisterRExtensionType(extension_type)
+}
+
+#' @rdname new_extension_type
+#' @export
+reregister_extension_type <- function(extension_type) {
+ tryCatch(
+ register_extension_type(extension_type),
+ error = function(e) {
+ unregister_extension_type(extension_type$extension_name())
+ register_extension_type(extension_type)
+ }
+ )
+}
+
+#' @rdname new_extension_type
+#' @export
+unregister_extension_type <- function(extension_name) {
+ arrow__UnregisterRExtensionType(extension_name)
+}
+
+VctrsExtensionType <- R6Class("VctrsExtensionType",
+ inherit = ExtensionType,
+ public = list(
+ ptype = function() {
+ private$.ptype
+ },
+
+ ToString = function() {
+ tf <- tempfile()
+ sink(tf)
+ on.exit({
+ sink(NULL)
+ unlink(tf)
+ })
+ print(self$ptype())
+ paste0(readLines(tf), collapse = "\n")
+ },
+
+ deserialize_instance = function() {
+ private$.ptype <- unserialize(self$extension_metadata())
+ },
+
+ ExtensionEquals = function(other) {
+ if (!inherits(other, "VctrsExtensionType")) {
+ return(FALSE)
+ }
+
+ identical(self$ptype(), other$ptype())
+ },
+
+ as_vector = function(extension_array) {
+ if (inherits(extension_array, "ChunkedArray")) {
+ # rather than convert one array at a time, use more Arrow
+ # machinery to convert the whole ChunkedArray at once
+ storage_arrays <- lapply(
+ seq_len(extension_array$num_chunks) - 1L,
+ function(i) extension_array$chunk(i)$storage()
+ )
+ storage <- chunked_array(!!! storage_arrays, type = self$storage_type())
+
+ vctrs::vec_restore(storage$as_vector(), self$ptype())
+ } else if (inherits(extension_array, "Array")) {
+ vctrs::vec_restore(
+ super$as_vector(extension_array),
+ self$ptype()
+ )
+ } else {
+ super$as_vector(extension_array)
+ }
+ }
+ ),
+ private = list(
+ .ptype = NULL
+ )
+)
+
+
+#' Extension type for generic typed vectors
+#'
+#' Most common R vector types are converted automatically to a suitable
+#' Arrow [data type][data-type] without the need for an extension type. For
+#' vector types whose conversion is not suitably handled by default, you can
+#' create a [vctrs_extension_array()], which passes [vctrs::vec_data()] to
+#' `Array$create()` and calls [vctrs::vec_restore()] when the [Array] is
+#' converted back into an R vector.
+#'
+#' @param x A vctr (i.e., [vctrs::vec_is()] returns `TRUE`).
+#' @param ptype A [vctrs::vec_ptype()], which is usually a zero-length
+#' version of the object with the appropriate attributes set. This value
+#' will be serialized using [serialize()], so it should not refer to any
+#' R object that can't be saved/reloaded.
+#' @inheritParams new_extension_type
+#'
+#' @return
+#' - `vctrs_extension_array()` returns an [ExtensionArray] instance with a
+#' `vctrs_extension_type()`.
+#' - `vctrs_extension_type()` returns an [ExtensionType] instance for the
+#' extension name "arrow.r.vctrs".
+#' @export
+#'
+#' @examplesIf arrow_available()
+#' (array <- vctrs_extension_array(as.POSIXlt("2022-01-02 03:45", tz = "UTC")))
+#' array$type
+#' as.vector(array)
+#'
+#' temp_feather <- tempfile()
+#' write_feather(arrow_table(col = array), temp_feather)
+#' read_feather(temp_feather)
+#' unlink(temp_feather)
+vctrs_extension_array <- function(x, ptype = vctrs::vec_ptype(x),
+ storage_type = NULL) {
+ if (inherits(x, "ExtensionArray") && inherits(x$type, "VctrsExtensionType")) {
+ return(x)
+ }
+
+ vctrs::vec_assert(x)
+ storage <- Array$create(vctrs::vec_data(x), type = storage_type)
+ type <- vctrs_extension_type(ptype, storage$type)
+ new_extension_array(storage, type)
+}
+
+#' @rdname vctrs_extension_array
+#' @export
+vctrs_extension_type <- function(ptype,
+ storage_type = type(vctrs::vec_data(ptype))) {
+ ptype <- vctrs::vec_ptype(ptype)
+
+ new_extension_type(
+ storage_type = storage_type,
+ extension_name = "arrow.r.vctrs",
+ extension_metadata = serialize(ptype, NULL),
+ type_class = VctrsExtensionType
+ )
+}
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index fcb7b2016a..c3810cdf09 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -144,6 +144,8 @@ reference:
- buffer
- read_message
- concat_arrays
+ - ExtensionArray
+ - vctrs_extension_array
- title: Arrow data types and schema
contents:
- Schema
@@ -156,6 +158,9 @@ reference:
- DataType
- DictionaryType
- FixedWidthType
+ - new_extension_type
+ - vctrs_extension_type
+ - ExtensionType
- title: Flight
contents:
- load_flight_server
diff --git a/r/man/ExtensionArray.Rd b/r/man/ExtensionArray.Rd
new file mode 100644
index 0000000000..84a63c9bb9
--- /dev/null
+++ b/r/man/ExtensionArray.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extension.R
+\docType{class}
+\name{ExtensionArray}
+\alias{ExtensionArray}
+\title{class arrow::ExtensionArray}
+\description{
+class arrow::ExtensionArray
+}
+\section{Methods}{
+
+
+The \code{ExtensionArray} class inherits from \code{Array}, but also provides
+access to the underlying storage of the extension.
+\itemize{
+\item \verb{$storage()}: Returns the underlying \link{Array} used to store
+values.
+}
+
+The \code{ExtensionArray} is not intended to be subclassed for extension
+types.
+}
+
diff --git a/r/man/ExtensionType.Rd b/r/man/ExtensionType.Rd
new file mode 100644
index 0000000000..6b05f3490d
--- /dev/null
+++ b/r/man/ExtensionType.Rd
@@ -0,0 +1,48 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extension.R
+\docType{class}
+\name{ExtensionType}
+\alias{ExtensionType}
+\title{class arrow::ExtensionType}
+\description{
+class arrow::ExtensionType
+}
+\section{Methods}{
+
+
+The \code{ExtensionType} class inherits from \code{DataType}, but also defines
+extra methods specific to extension types:
+\itemize{
+\item \verb{$storage_type()}: Returns the underlying \link{DataType} used to store
+values.
+\item \verb{$storage_id()}: Returns the \link{Type} identifier corresponding to the
+\verb{$storage_type()}.
+\item \verb{$extension_name()}: Returns the extension name.
+\item \verb{$extension_metadata()}: Returns the serialized version of the extension
+metadata as a \code{\link[=raw]{raw()}} vector.
+\item \verb{$extension_metadata_utf8()}: Returns the serialized version of the
+extension metadata as a UTF-8 encoded string.
+\item \verb{$WrapArray(array)}: Wraps a storage \link{Array} into an \link{ExtensionArray}
+with this extension type.
+}
+
+In addition, subclasses may override the following methos to customize
+the behaviour of extension classes.
+\itemize{
+\item \verb{$deserialize_instance()}: This method is called when a new \link{ExtensionType}
+is initialized and is responsible for parsing and validating
+the serialized extension_metadata (a \code{\link[=raw]{raw()}} vector)
+such that its contents can be inspected by fields and/or methods
+of the R6 ExtensionType subclass. Implementations must also check the
+\code{storage_type} to make sure it is compatible with the extension type.
+\item \verb{$as_vector(extension_array)}: Convert an \link{Array} or \link{ChunkedArray} to an R
+vector. This method is called by \code{\link[=as.vector]{as.vector()}} on \link{ExtensionArray}
+objects, when a \link{RecordBatch} containing an \link{ExtensionArray} is
+converted to a \code{\link[=data.frame]{data.frame()}}, or when a \link{ChunkedArray} (e.g., a column
+in a \link{Table}) is converted to an R vector. The default method returns the
+converted storage array.
+\item \verb{$ToString()} Return a string representation that will be printed
+to the console when this type or an Array of this type is printed.
+}
+}
+
diff --git a/r/man/new_extension_type.Rd b/r/man/new_extension_type.Rd
new file mode 100644
index 0000000000..96d5c10c93
--- /dev/null
+++ b/r/man/new_extension_type.Rd
@@ -0,0 +1,167 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extension.R
+\name{new_extension_type}
+\alias{new_extension_type}
+\alias{new_extension_array}
+\alias{register_extension_type}
+\alias{reregister_extension_type}
+\alias{unregister_extension_type}
+\title{Extension types}
+\usage{
+new_extension_type(
+ storage_type,
+ extension_name,
+ extension_metadata = raw(),
+ type_class = ExtensionType
+)
+
+new_extension_array(storage_array, extension_type)
+
+register_extension_type(extension_type)
+
+reregister_extension_type(extension_type)
+
+unregister_extension_type(extension_name)
+}
+\arguments{
+\item{storage_type}{The \link[=data-type]{data type} of the underlying storage
+array.}
+
+\item{extension_name}{The extension name. This should be namespaced using
+"dot" syntax (i.e., "some_package.some_type"). The namespace "arrow"
+is reserved for extension types defined by the Apache Arrow libraries.}
+
+\item{extension_metadata}{A \code{\link[=raw]{raw()}} or \code{\link[=character]{character()}} vector containing the
+serialized version of the type. Chatacter vectors must be length 1 and
+are converted to UTF-8 before converting to \code{\link[=raw]{raw()}}.}
+
+\item{type_class}{An \link[R6:R6Class]{R6::R6Class} whose \verb{$new()} class method will be
+used to construct a new instance of the type.}
+
+\item{storage_array}{An \link{Array} object of the underlying storage.}
+
+\item{extension_type}{An \link{ExtensionType} instance.}
+}
+\value{
+\itemize{
+\item \code{new_extension_type()} returns an \link{ExtensionType} instance according
+to the \code{type_class} specified.
+\item \code{new_extension_array()} returns an \link{ExtensionArray} whose \verb{$type}
+corresponds to \code{extension_type}.
+\item \code{register_extension_type()}, \code{unregister_extension_type()}
+and \code{reregister_extension_type()} return \code{NULL}, invisibly.
+}
+}
+\description{
+Extension arrays are wrappers around regular Arrow \link{Array} objects
+that provide some customized behaviour and/or storage. A common use-case
+for extension types is to define a customized conversion between an
+an Arrow \link{Array} and an R object when the default conversion is slow
+or looses metadata important to the interpretation of values in the array.
+For most types, the built-in
+\link[=vctrs_extension_type]{vctrs extension type} is probably sufficient.
+}
+\details{
+These functions create, register, and unregister \link{ExtensionType}
+and \link{ExtensionArray} objects. To use an extension type you will have to:
+\itemize{
+\item Define an \link[R6:R6Class]{R6::R6Class} that inherits from \link{ExtensionType} and reimplement
+one or more methods (e.g., \code{deserialize_instance()}).
+\item Make a type constructor function (e.g., \code{my_extension_type()}) that calls
+\code{\link[=new_extension_type]{new_extension_type()}} to create an R6 instance that can be used as a
+\link[=data-type]{data type} elsewhere in the package.
+\item Make an array constructor function (e.g., \code{my_extension_array()}) that
+calls \code{\link[=new_extension_array]{new_extension_array()}} to create an \link{Array} instance of your
+extension type.
+\item Register a dummy instance of your extension type created using
+you constructor function using \code{\link[=register_extension_type]{register_extension_type()}}.
+}
+
+If defining an extension type in an R package, you will probably want to
+use \code{\link[=reregister_extension_type]{reregister_extension_type()}} in that package's \code{\link[=.onLoad]{.onLoad()}} hook
+since your package will probably get reloaded in the same R session
+during its development and \code{\link[=register_extension_type]{register_extension_type()}} will error if
+called twice for the same \code{extension_name}. For an example of an
+extension type that uses most of these features, see
+\code{\link[=vctrs_extension_type]{vctrs_extension_type()}}.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Create the R6 type whose methods control how Array objects are
+# converted to R objects, how equality between types is computed,
+# and how types are printed.
+QuantizedType <- R6::R6Class(
+ "QuantizedType",
+ inherit = ExtensionType,
+ public = list(
+ # methods to access the custom metadata fields
+ center = function() private$.center,
+ scale = function() private$.scale,
+
+ # called when an Array of this type is converted to an R vector
+ as_vector = function(extension_array) {
+ if (inherits(extension_array, "ExtensionArray")) {
+ unquantized_arrow <-
+ (extension_array$storage()$cast(float64()) / private$.scale) +
+ private$.center
+
+ as.vector(unquantized_arrow)
+ } else {
+ super$as_vector(extension_array)
+ }
+ },
+
+ # populate the custom metadata fields from the serialized metadata
+ deserialize_instance = function() {
+ vals <- as.numeric(strsplit(self$extension_metadata_utf8(), ";")[[1]])
+ private$.center <- vals[1]
+ private$.scale <- vals[2]
+ }
+ ),
+
+ private = list(
+ .center = NULL,
+ .scale = NULL
+ )
+)
+
+# Create a helper type constructor that calls new_extension_type()
+quantized <- function(center = 0, scale = 1, storage_type = int32()) {
+ new_extension_type(
+ storage_type = storage_type,
+ extension_name = "arrow.example.quantized",
+ extension_metadata = paste(center, scale, sep = ";"),
+ type_class = QuantizedType
+ )
+}
+
+# Create a helper array constructor that calls new_extension_array()
+quantized_array <- function(x, center = 0, scale = 1,
+ storage_type = int32()) {
+ type <- quantized(center, scale, storage_type)
+ new_extension_array(
+ Array$create((x - center) * scale, type = storage_type),
+ type
+ )
+}
+
+# Register the extension type so that Arrow knows what to do when
+# it encounters this extension type
+reregister_extension_type(quantized())
+
+# Create Array objects and use them!
+(vals <- runif(5, min = 19, max = 21))
+
+(array <- quantized_array(
+ vals,
+ center = 20,
+ scale = 2 ^ 15 - 1,
+ storage_type = int16())
+)
+
+array$type$center()
+array$type$scale()
+
+as.vector(array)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/vctrs_extension_array.Rd b/r/man/vctrs_extension_array.Rd
new file mode 100644
index 0000000000..b80ce48dc2
--- /dev/null
+++ b/r/man/vctrs_extension_array.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/extension.R
+\name{vctrs_extension_array}
+\alias{vctrs_extension_array}
+\alias{vctrs_extension_type}
+\title{Extension type for generic typed vectors}
+\usage{
+vctrs_extension_array(x, ptype = vctrs::vec_ptype(x), storage_type = NULL)
+
+vctrs_extension_type(ptype, storage_type = type(vctrs::vec_data(ptype)))
+}
+\arguments{
+\item{x}{A vctr (i.e., \code{\link[vctrs:vec_assert]{vctrs::vec_is()}} returns \code{TRUE}).}
+
+\item{ptype}{A \code{\link[vctrs:vec_ptype]{vctrs::vec_ptype()}}, which is usually a zero-length
+version of the object with the appropriate attributes set. This value
+will be serialized using \code{\link[=serialize]{serialize()}}, so it should not refer to any
+R object that can't be saved/reloaded.}
+
+\item{storage_type}{The \link[=data-type]{data type} of the underlying storage
+array.}
+}
+\value{
+\itemize{
+\item \code{vctrs_extension_array()} returns an \link{ExtensionArray} instance with a
+\code{vctrs_extension_type()}.
+\item \code{vctrs_extension_type()} returns an \link{ExtensionType} instance for the
+extension name "arrow.r.vctrs".
+}
+}
+\description{
+Most common R vector types are converted automatically to a suitable
+Arrow \link[=data-type]{data type} without the need for an extension type. For
+vector types whose conversion is not suitably handled by default, you can
+create a \code{\link[=vctrs_extension_array]{vctrs_extension_array()}}, which passes \code{\link[vctrs:vec_data]{vctrs::vec_data()}} to
+\code{Array$create()} and calls \code{\link[vctrs:vec_proxy]{vctrs::vec_restore()}} when the \link{Array} is
+converted back into an R vector.
+}
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+(array <- vctrs_extension_array(as.POSIXlt("2022-01-02 03:45", tz = "UTC")))
+array$type
+as.vector(array)
+
+temp_feather <- tempfile()
+write_feather(arrow_table(col = array), temp_feather)
+read_feather(temp_feather)
+unlink(temp_feather)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/src/array.cpp b/r/src/array.cpp
index 8fcc96e0d4..16490bbaec 100644
--- a/r/src/array.cpp
+++ b/r/src/array.cpp
@@ -41,6 +41,8 @@ const char* r6_class_name<arrow::Array>::get(const std::shared_ptr<arrow::Array>
return "FixedSizeListArray";
case arrow::Type::MAP:
return "MapArray";
+ case arrow::Type::EXTENSION:
+ return "ExtensionArray";
default:
return "Array";
diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp
index 06d0a87a10..b89738d6c6 100644
--- a/r/src/array_to_vector.cpp
+++ b/r/src/array_to_vector.cpp
@@ -29,6 +29,7 @@
#include <cpp11/altrep.hpp>
#include <type_traits>
+#include "./extension.h"
#include "./r_task_group.h"
namespace arrow {
@@ -1154,6 +1155,35 @@ class Converter_Null : public Converter {
}
};
+// Unlike other types, conversion of ExtensionType (chunked) arrays occurs at
+// R level via the ExtensionType (or subclass) R6 instance. We do this via Allocate,
+// since it is called once per ChunkedArray.
+class Converter_Extension : public Converter {
+ public:
+ explicit Converter_Extension(const std::shared_ptr<ChunkedArray>& chunked_array)
+ : Converter(chunked_array) {}
+
+ SEXP Allocate(R_xlen_t n) const {
+ auto extension_type =
+ dynamic_cast<const RExtensionType*>(chunked_array_->type().get());
+ if (extension_type == nullptr) {
+ Rf_error("Converter_Extension can't be used with a non-R extension type");
+ }
+
+ return extension_type->Convert(chunked_array_);
+ }
+
+ // At this point we have already done the conversion
+ Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const {
+ return Status::OK();
+ }
+
+ Status Ingest_some_nulls(SEXP data, const std::shared_ptr<arrow::Array>& array,
+ R_xlen_t start, R_xlen_t n, size_t chunk_index) const {
+ return Status::OK();
+ }
+};
+
bool ArraysCanFitInteger(ArrayVector arrays) {
bool all_can_fit = true;
auto i32 = arrow::int32();
@@ -1316,6 +1346,9 @@ std::shared_ptr<Converter> Converter::Make(
case Type::NA:
return std::make_shared<arrow::r::Converter_Null>(chunked_array);
+ case Type::EXTENSION:
+ return std::make_shared<arrow::r::Converter_Extension>(chunked_array);
+
default:
break;
}
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 0a29ed0872..c4271a19aa 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -4167,6 +4167,147 @@ extern "C" SEXP _arrow_compute___expr__type_id(SEXP x_sexp, SEXP schema_sexp){
}
#endif
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::environment ExtensionType__initialize(const std::shared_ptr<arrow::DataType>& storage_type, std::string extension_name, cpp11::raws extension_metadata, cpp11::environment r6_class);
+extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_class_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type storage_type(storage_type_sexp);
+ arrow::r::Input<std::string>::type extension_name(extension_name_sexp);
+ arrow::r::Input<cpp11::raws>::type extension_metadata(extension_metadata_sexp);
+ arrow::r::Input<cpp11::environment>::type r6_class(r6_class_sexp);
+ return cpp11::as_sexp(ExtensionType__initialize(storage_type, extension_name, extension_metadata, r6_class));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExtensionType__initialize(SEXP storage_type_sexp, SEXP extension_name_sexp, SEXP extension_metadata_sexp, SEXP r6_class_sexp){
+ Rf_error("Cannot call ExtensionType__initialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::string ExtensionType__extension_name(const std::shared_ptr<arrow::ExtensionType>& type);
+extern "C" SEXP _arrow_ExtensionType__extension_name(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ExtensionType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ExtensionType__extension_name(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExtensionType__extension_name(SEXP type_sexp){
+ Rf_error("Cannot call ExtensionType__extension_name(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::raws ExtensionType__Serialize(const std::shared_ptr<arrow::ExtensionType>& type);
+extern "C" SEXP _arrow_ExtensionType__Serialize(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ExtensionType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ExtensionType__Serialize(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExtensionType__Serialize(SEXP type_sexp){
+ Rf_error("Cannot call ExtensionType__Serialize(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::DataType> ExtensionType__storage_type(const std::shared_ptr<arrow::ExtensionType>& type);
+extern "C" SEXP _arrow_ExtensionType__storage_type(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ExtensionType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ExtensionType__storage_type(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExtensionType__storage_type(SEXP type_sexp){
+ Rf_error("Cannot call ExtensionType__storage_type(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> ExtensionType__MakeArray(const std::shared_ptr<arrow::ExtensionType>& type, const std::shared_ptr<arrow::ArrayData>& data);
+extern "C" SEXP _arrow_ExtensionType__MakeArray(SEXP type_sexp, SEXP data_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ExtensionType>&>::type type(type_sexp);
+ arrow::r::Input<const std::shared_ptr<arrow::ArrayData>&>::type data(data_sexp);
+ return cpp11::as_sexp(ExtensionType__MakeArray(type, data));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExtensionType__MakeArray(SEXP type_sexp, SEXP data_sexp){
+ Rf_error("Cannot call ExtensionType__MakeArray(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+cpp11::environment ExtensionType__r6_class(const std::shared_ptr<arrow::ExtensionType>& type);
+extern "C" SEXP _arrow_ExtensionType__r6_class(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ExtensionType>&>::type type(type_sexp);
+ return cpp11::as_sexp(ExtensionType__r6_class(type));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExtensionType__r6_class(SEXP type_sexp){
+ Rf_error("Cannot call ExtensionType__r6_class(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+std::shared_ptr<arrow::Array> ExtensionArray__storage(const std::shared_ptr<arrow::ExtensionArray>& array);
+extern "C" SEXP _arrow_ExtensionArray__storage(SEXP array_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::ExtensionArray>&>::type array(array_sexp);
+ return cpp11::as_sexp(ExtensionArray__storage(array));
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_ExtensionArray__storage(SEXP array_sexp){
+ Rf_error("Cannot call ExtensionArray__storage(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void arrow__RegisterRExtensionType(const std::shared_ptr<arrow::DataType>& type);
+extern "C" SEXP _arrow_arrow__RegisterRExtensionType(SEXP type_sexp){
+BEGIN_CPP11
+ arrow::r::Input<const std::shared_ptr<arrow::DataType>&>::type type(type_sexp);
+ arrow__RegisterRExtensionType(type);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_arrow__RegisterRExtensionType(SEXP type_sexp){
+ Rf_error("Cannot call arrow__RegisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
+// extension-impl.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void arrow__UnregisterRExtensionType(std::string type_name);
+extern "C" SEXP _arrow_arrow__UnregisterRExtensionType(SEXP type_name_sexp){
+BEGIN_CPP11
+ arrow::r::Input<std::string>::type type_name(type_name_sexp);
+ arrow__UnregisterRExtensionType(type_name);
+ return R_NilValue;
+END_CPP11
+}
+#else
+extern "C" SEXP _arrow_arrow__UnregisterRExtensionType(SEXP type_name_sexp){
+ Rf_error("Cannot call arrow__UnregisterRExtensionType(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. ");
+}
+#endif
+
// feather.cpp
#if defined(ARROW_R_WITH_ARROW)
void ipc___WriteFeather__Table(const std::shared_ptr<arrow::io::OutputStream>& stream, const std::shared_ptr<arrow::Table>& table, int version, int chunk_size, arrow::Compression::type compression, int compression_level);
@@ -8011,6 +8152,15 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_compute___expr__ToString", (DL_FUNC) &_arrow_compute___expr__ToString, 1},
{ "_arrow_compute___expr__type", (DL_FUNC) &_arrow_compute___expr__type, 2},
{ "_arrow_compute___expr__type_id", (DL_FUNC) &_arrow_compute___expr__type_id, 2},
+ { "_arrow_ExtensionType__initialize", (DL_FUNC) &_arrow_ExtensionType__initialize, 4},
+ { "_arrow_ExtensionType__extension_name", (DL_FUNC) &_arrow_ExtensionType__extension_name, 1},
+ { "_arrow_ExtensionType__Serialize", (DL_FUNC) &_arrow_ExtensionType__Serialize, 1},
+ { "_arrow_ExtensionType__storage_type", (DL_FUNC) &_arrow_ExtensionType__storage_type, 1},
+ { "_arrow_ExtensionType__MakeArray", (DL_FUNC) &_arrow_ExtensionType__MakeArray, 2},
+ { "_arrow_ExtensionType__r6_class", (DL_FUNC) &_arrow_ExtensionType__r6_class, 1},
+ { "_arrow_ExtensionArray__storage", (DL_FUNC) &_arrow_ExtensionArray__storage, 1},
+ { "_arrow_arrow__RegisterRExtensionType", (DL_FUNC) &_arrow_arrow__RegisterRExtensionType, 1},
+ { "_arrow_arrow__UnregisterRExtensionType", (DL_FUNC) &_arrow_arrow__UnregisterRExtensionType, 1},
{ "_arrow_ipc___WriteFeather__Table", (DL_FUNC) &_arrow_ipc___WriteFeather__Table, 6},
{ "_arrow_ipc___feather___Reader__version", (DL_FUNC) &_arrow_ipc___feather___Reader__version, 1},
{ "_arrow_ipc___feather___Reader__Read", (DL_FUNC) &_arrow_ipc___feather___Reader__Read, 2},
diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp
index fd083f66d4..68b6c8fada 100644
--- a/r/src/datatype.cpp
+++ b/r/src/datatype.cpp
@@ -101,6 +101,8 @@ const char* r6_class_name<arrow::DataType>::get(
return "StructType";
case Type::DICTIONARY:
return "DictionaryType";
+ case Type::EXTENSION:
+ return "ExtensionType";
default:
break;
diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp
new file mode 100644
index 0000000000..57c4874c97
--- /dev/null
+++ b/r/src/extension-impl.cpp
@@ -0,0 +1,198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#if defined(ARROW_R_WITH_ARROW)
+
+#include <thread>
+
+#include <arrow/array.h>
+#include <arrow/extension_type.h>
+#include <arrow/type.h>
+
+#include "./extension.h"
+#include "./safe-call-into-r.h"
+
+bool RExtensionType::ExtensionEquals(const arrow::ExtensionType& other) const {
+ // Avoid materializing the R6 instance if at all possible
+ if (other.extension_name() != extension_name()) {
+ return false;
+ }
+
+ if (other.Serialize() == Serialize()) {
+ return true;
+ }
+
+ // With any ambiguity, we need to materialize the R6 instance and call its
+ // ExtensionEquals method. We can't do this on the non-R thread.
+ // After ARROW-15841, we can use SafeCallIntoR.
+ arrow::Result<bool> result = SafeCallIntoR<bool>([&]() {
+ cpp11::environment instance = r6_instance();
+ cpp11::function instance_ExtensionEquals(instance["ExtensionEquals"]);
+
+ std::shared_ptr<DataType> other_shared =
+ ValueOrStop(other.Deserialize(other.storage_type(), other.Serialize()));
+ cpp11::sexp other_r6 = cpp11::to_r6<DataType>(other_shared, "ExtensionType");
+
+ cpp11::logicals result(instance_ExtensionEquals(other_r6));
+ return cpp11::as_cpp<bool>(result);
+ });
+
+ if (!result.ok()) {
+ throw std::runtime_error(result.status().message());
+ }
+
+ return result.ValueUnsafe();
+}
+
+std::shared_ptr<arrow::Array> RExtensionType::MakeArray(
+ std::shared_ptr<arrow::ArrayData> data) const {
+ std::shared_ptr<arrow::ArrayData> new_data = data->Copy();
+ std::unique_ptr<RExtensionType> cloned = Clone();
+ new_data->type = std::shared_ptr<RExtensionType>(cloned.release());
+ return std::make_shared<arrow::ExtensionArray>(new_data);
+}
+
+arrow::Result<std::shared_ptr<arrow::DataType>> RExtensionType::Deserialize(
+ std::shared_ptr<arrow::DataType> storage_type,
+ const std::string& serialized_data) const {
+ std::unique_ptr<RExtensionType> cloned = Clone();
+ cloned->storage_type_ = storage_type;
+ cloned->extension_metadata_ = serialized_data;
+
+ // We could create an ephemeral R6 instance here, which will call the R6 instance's
+ // deserialize_instance() method, possibly erroring when the metadata is
+ // invalid or the deserialized values are invalid. The complexity of setting up
+ // an event loop from wherever this *might* be called is high and hard to
+ // predict. As a compromise, just create the instance when it is safe to
+ // do so.
+ if (GetMainRThread().IsMainThread()) {
+ r6_instance();
+ }
+
+ return std::shared_ptr<RExtensionType>(cloned.release());
+}
+
+std::string RExtensionType::ToString() const {
+ arrow::Result<std::string> result = SafeCallIntoR<std::string>([&]() {
+ cpp11::environment instance = r6_instance();
+ cpp11::function instance_ToString(instance["ToString"]);
+ cpp11::sexp result = instance_ToString();
+ return cpp11::as_cpp<std::string>(result);
+ });
+
+ // In the event of an error (e.g., we are not on the main thread
+ // and we are not inside RunWithCapturedR()), just call the default method
+ if (!result.ok()) {
+ return ExtensionType::ToString();
+ } else {
+ return result.ValueUnsafe();
+ }
+}
+
+cpp11::sexp RExtensionType::Convert(
+ const std::shared_ptr<arrow::ChunkedArray>& array) const {
+ cpp11::environment instance = r6_instance();
+ cpp11::function instance_Convert(instance["as_vector"]);
+ cpp11::sexp array_sexp = cpp11::to_r6<arrow::ChunkedArray>(array, "ChunkedArray");
+ return instance_Convert(array_sexp);
+}
+
+std::unique_ptr<RExtensionType> RExtensionType::Clone() const {
+ RExtensionType* ptr =
+ new RExtensionType(storage_type(), extension_name_, extension_metadata_, r6_class_);
+ return std::unique_ptr<RExtensionType>(ptr);
+}
+
+cpp11::environment RExtensionType::r6_instance(
+ std::shared_ptr<arrow::DataType> storage_type,
+ const std::string& serialized_data) const {
+ // This is a version of to_r6<>() that is a more direct route to creating the object.
+ // This is done to avoid circular calls, since to_r6<>() has to go through
+ // ExtensionType$new(), which then calls back to C++ to get r6_class_ to then
+ // return the correct subclass.
+ std::unique_ptr<RExtensionType> cloned = Clone();
+ cpp11::external_pointer<std::shared_ptr<RExtensionType>> xp(
+ new std::shared_ptr<RExtensionType>(cloned.release()));
+
+ cpp11::function r6_class_new(r6_class()["new"]);
+ return r6_class_new(xp);
+}
+
+// [[arrow::export]]
+cpp11::environment ExtensionType__initialize(
+ const std::shared_ptr<arrow::DataType>& storage_type, std::string extension_name,
+ cpp11::raws extension_metadata, cpp11::environment r6_class) {
+ std::string metadata_string(extension_metadata.begin(), extension_metadata.end());
+ auto r6_class_shared = std::make_shared<cpp11::environment>(r6_class);
+ RExtensionType cpp_type(storage_type, extension_name, metadata_string, r6_class_shared);
+ return cpp_type.r6_instance();
+}
+
+// [[arrow::export]]
+std::string ExtensionType__extension_name(
+ const std::shared_ptr<arrow::ExtensionType>& type) {
+ return type->extension_name();
+}
+
+// [[arrow::export]]
+cpp11::raws ExtensionType__Serialize(const std::shared_ptr<arrow::ExtensionType>& type) {
+ std::string serialized_string = type->Serialize();
+ cpp11::writable::raws bytes(serialized_string.begin(), serialized_string.end());
+ return bytes;
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::DataType> ExtensionType__storage_type(
+ const std::shared_ptr<arrow::ExtensionType>& type) {
+ return type->storage_type();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> ExtensionType__MakeArray(
+ const std::shared_ptr<arrow::ExtensionType>& type,
+ const std::shared_ptr<arrow::ArrayData>& data) {
+ return type->MakeArray(data);
+}
+
+// [[arrow::export]]
+cpp11::environment ExtensionType__r6_class(
+ const std::shared_ptr<arrow::ExtensionType>& type) {
+ auto r_type =
+ arrow::internal::checked_pointer_cast<RExtensionType, arrow::ExtensionType>(type);
+ return r_type->r6_class();
+}
+
+// [[arrow::export]]
+std::shared_ptr<arrow::Array> ExtensionArray__storage(
+ const std::shared_ptr<arrow::ExtensionArray>& array) {
+ return array->storage();
+}
+
+// [[arrow::export]]
+void arrow__RegisterRExtensionType(const std::shared_ptr<arrow::DataType>& type) {
+ auto ext_type = std::dynamic_pointer_cast<arrow::ExtensionType>(type);
+ StopIfNotOk(arrow::RegisterExtensionType(ext_type));
+}
+
+// [[arrow::export]]
+void arrow__UnregisterRExtensionType(std::string type_name) {
+ StopIfNotOk(arrow::UnregisterExtensionType(type_name));
+}
+
+#endif
diff --git a/r/src/extension.h b/r/src/extension.h
new file mode 100644
index 0000000000..fbd3ad4846
--- /dev/null
+++ b/r/src/extension.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// or more contributor license agreements. See the NOTICE file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "./arrow_types.h"
+
+#include <arrow/array.h>
+#include <arrow/extension_type.h>
+#include <arrow/type.h>
+
+// A wrapper around arrow::ExtensionType that allows R to register extension
+// types whose Deserialize, ExtensionEquals, and Serialize methods are
+// in meaningfully handled at the R level. At the C++ level, the type is
+// already serialized to minimize calls to R from C++.
+//
+// Using a std::shared_ptr<> to wrap a cpp11::sexp type is unusual, but we
+// need it here to avoid calling the copy constructor from another thread,
+// since this might call into the R API. If we don't do this, we get crashes
+// when reading a multi-file Dataset.
+class RExtensionType : public arrow::ExtensionType {
+ public:
+ RExtensionType(const std::shared_ptr<arrow::DataType> storage_type,
+ std::string extension_name, std::string extension_metadata,
+ std::shared_ptr<cpp11::environment> r6_class)
+ : arrow::ExtensionType(storage_type),
+ extension_name_(extension_name),
+ extension_metadata_(extension_metadata),
+ r6_class_(r6_class) {}
+
+ std::string extension_name() const { return extension_name_; }
+
+ bool ExtensionEquals(const arrow::ExtensionType& other) const;
+
+ std::shared_ptr<arrow::Array> MakeArray(std::shared_ptr<arrow::ArrayData> data) const;
+
+ arrow::Result<std::shared_ptr<arrow::DataType>> Deserialize(
+ std::shared_ptr<arrow::DataType> storage_type,
+ const std::string& serialized_data) const;
+
+ std::string Serialize() const { return extension_metadata_; }
+
+ std::string ToString() const;
+
+ cpp11::sexp Convert(const std::shared_ptr<arrow::ChunkedArray>& array) const;
+
+ std::unique_ptr<RExtensionType> Clone() const;
+
+ cpp11::environment r6_class() const { return *r6_class_; }
+
+ cpp11::environment r6_instance(std::shared_ptr<arrow::DataType> storage_type,
+ const std::string& serialized_data) const;
+
+ cpp11::environment r6_instance() const {
+ return r6_instance(storage_type(), Serialize());
+ }
+
+ private:
+ std::string extension_name_;
+ std::string extension_metadata_;
+ std::string cached_to_string_;
+ std::shared_ptr<cpp11::environment> r6_class_;
+};
diff --git a/r/tests/testthat/_snaps/extension.md b/r/tests/testthat/_snaps/extension.md
new file mode 100644
index 0000000000..4335958b8a
--- /dev/null
+++ b/r/tests/testthat/_snaps/extension.md
@@ -0,0 +1,10 @@
+# extension types can be created
+
+ `extension_array` must be a ChunkedArray or ExtensionArray
+ i Got object of type character
+
+# vctrs extension type works
+
+ `extension_array` must be a ChunkedArray or ExtensionArray
+ i Got object of type character
+
diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R
new file mode 100644
index 0000000000..cf82b2f1f2
--- /dev/null
+++ b/r/tests/testthat/test-extension.R
@@ -0,0 +1,345 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("extension types can be created", {
+ type <- new_extension_type(
+ int32(),
+ "arrow_r.simple_extension",
+ charToRaw("some custom metadata"),
+ )
+
+ expect_r6_class(type, "ExtensionType")
+ expect_identical(type$extension_name(), "arrow_r.simple_extension")
+ expect_true(type$storage_type() == int32())
+ expect_identical(type$storage_id(), int32()$id)
+ expect_identical(type$extension_metadata(), charToRaw("some custom metadata"))
+ expect_identical(type$ToString(), "ExtensionType <some custom metadata>")
+
+ storage <- Array$create(1:10)
+ array <- type$WrapArray(storage)
+ expect_r6_class(array, "ExtensionArray")
+ expect_r6_class(array$type, "ExtensionType")
+
+ expect_true(array$type == type)
+ expect_true(all(array$storage() == storage))
+
+ expect_identical(array$as_vector(), 1:10)
+ expect_identical(chunked_array(array)$as_vector(), 1:10)
+
+ expect_snapshot_error(
+ type$as_vector("not an extension array or chunked array")
+ )
+})
+
+test_that("extension type subclasses work", {
+ SomeExtensionTypeSubclass <- R6Class(
+ "SomeExtensionTypeSubclass", inherit = ExtensionType,
+ public = list(
+ some_custom_method = function() {
+ private$some_custom_field
+ },
+
+ deserialize_instance = function() {
+ private$some_custom_field <- head(self$extension_metadata(), 5)
+ }
+ ),
+ private = list(
+ some_custom_field = NULL
+ )
+ )
+
+ type <- new_extension_type(
+ int32(),
+ "some_extension_subclass",
+ charToRaw("some custom metadata"),
+ type_class = SomeExtensionTypeSubclass
+ )
+
+ expect_r6_class(type, "SomeExtensionTypeSubclass")
+ expect_identical(type$some_custom_method(), charToRaw("some "))
+
+ register_extension_type(type)
+
+ # create a new type instance with storage/metadata not identical
+ # to the registered type
+ type2 <- new_extension_type(
+ float64(),
+ "some_extension_subclass",
+ charToRaw("some other custom metadata"),
+ type_class = SomeExtensionTypeSubclass
+ )
+
+ ptr_type <- allocate_arrow_schema()
+ type2$export_to_c(ptr_type)
+ type3 <- DataType$import_from_c(ptr_type)
+ delete_arrow_schema(ptr_type)
+
+ expect_identical(type3$extension_name(), "some_extension_subclass")
+ expect_identical(type3$some_custom_method(), type2$some_custom_method())
+ expect_identical(type3$extension_metadata(), type2$extension_metadata())
+ expect_true(type3$storage_type() == type2$storage_type())
+
+ array <- type3$WrapArray(Array$create(1:10))
+ expect_r6_class(array, "ExtensionArray")
+
+ unregister_extension_type("some_extension_subclass")
+})
+
+test_that("extension types can use UTF-8 for metadata", {
+ type <- new_extension_type(
+ int32(),
+ "arrow.test.simple_extension",
+ "\U0001f4a9\U0001f4a9\U0001f4a9\U0001f4a9"
+ )
+
+ expect_identical(
+ type$extension_metadata_utf8(),
+ "\U0001f4a9\U0001f4a9\U0001f4a9\U0001f4a9"
+ )
+
+ expect_match(type$ToString(), "\U0001f4a9", fixed = TRUE)
+})
+
+test_that("extension types can be printed that don't use UTF-8 for metadata", {
+ type <- new_extension_type(
+ int32(),
+ "arrow.test.simple_extension",
+ as.raw(0:5)
+ )
+
+ expect_match(type$ToString(), "00 01 02 03 04 05")
+})
+
+test_that("extension subclasses can override the ExtensionEquals method", {
+ SomeExtensionTypeSubclass <- R6Class(
+ "SomeExtensionTypeSubclass", inherit = ExtensionType,
+ public = list(
+ field_values = NULL,
+
+ deserialize_instance = function() {
+ self$field_values <- unserialize(self$extension_metadata())
+ },
+
+ ExtensionEquals = function(other) {
+ if (!inherits(other, "SomeExtensionTypeSubclass")) {
+ return(FALSE)
+ }
+
+ setequal(names(other$field_values), names(self$field_values)) &&
+ identical(
+ other$field_values[names(self$field_values)],
+ self$field_values
+ )
+ }
+ )
+ )
+
+ type <- new_extension_type(
+ int32(),
+ "some_extension_subclass",
+ serialize(list(field1 = "value1", field2 = "value2"), NULL),
+ type_class = SomeExtensionTypeSubclass
+ )
+
+ register_extension_type(type)
+
+ expect_true(type$ExtensionEquals(type))
+ expect_true(type$Equals(type))
+
+ type2 <- new_extension_type(
+ int32(),
+ "some_extension_subclass",
+ serialize(list(field2 = "value2", field1 = "value1"), NULL),
+ type_class = SomeExtensionTypeSubclass
+ )
+
+ expect_true(type$ExtensionEquals(type2))
+ expect_true(type$Equals(type2))
+
+ unregister_extension_type("some_extension_subclass")
+})
+
+test_that("vctrs extension type works", {
+ custom_vctr <- vctrs::new_vctr(
+ 1:4,
+ attr_key = "attr_val",
+ class = "arrow_custom_test"
+ )
+
+ type <- vctrs_extension_type(custom_vctr)
+ expect_r6_class(type, "VctrsExtensionType")
+ expect_identical(type$ptype(), vctrs::vec_ptype(custom_vctr))
+ expect_true(type$Equals(type))
+ expect_match(type$ToString(), "arrow_custom_test")
+
+ array_in <- vctrs_extension_array(custom_vctr)
+ expect_true(array_in$type$Equals(type))
+ expect_identical(vctrs_extension_array(array_in), array_in)
+
+ tf <- tempfile()
+ on.exit(unlink(tf))
+ write_feather(arrow_table(col = array_in), tf)
+ table_out <- read_feather(tf, as_data_frame = FALSE)
+ array_out <- table_out$col$chunk(0)
+
+ expect_r6_class(array_out$type, "VctrsExtensionType")
+ expect_r6_class(array_out, "ExtensionArray")
+
+ expect_true(array_out$type$Equals(type))
+ expect_identical(
+ array_out$as_vector(),
+ custom_vctr
+ )
+
+ chunked_array_out <- table_out$col
+ expect_true(chunked_array_out$type$Equals(type))
+ expect_identical(
+ chunked_array_out$as_vector(),
+ custom_vctr
+ )
+
+ expect_snapshot_error(
+ type$as_vector("not an extension array or chunked array")
+ )
+})
+
+test_that("chunked arrays can roundtrip extension types", {
+ custom_vctr1 <- vctrs::new_vctr(1:4, class = "arrow_custom_test")
+ custom_vctr2 <- vctrs::new_vctr(5:8, class = "arrow_custom_test")
+ custom_array1 <- vctrs_extension_array(custom_vctr1)
+ custom_array2 <- vctrs_extension_array(custom_vctr2)
+
+ custom_chunked <- chunked_array(custom_array1, custom_array2)
+ expect_r6_class(custom_chunked$type, "VctrsExtensionType")
+ expect_identical(
+ custom_chunked$as_vector(),
+ vctrs::new_vctr(1:8, class = "arrow_custom_test")
+ )
+})
+
+test_that("RecordBatch can roundtrip extension types", {
+ custom_vctr <- vctrs::new_vctr(1:8, class = "arrow_custom_test")
+ custom_array <- vctrs_extension_array(custom_vctr)
+ normal_vctr <- letters[1:8]
+
+ custom_record_batch <- record_batch(custom = custom_array)
+ expect_identical(
+ custom_record_batch$to_data_frame(),
+ tibble::tibble(
+ custom = custom_vctr
+ )
+ )
+
+ mixed_record_batch <- record_batch(
+ custom = custom_array,
+ normal = normal_vctr
+ )
+ expect_identical(
+ mixed_record_batch$to_data_frame(),
+ tibble::tibble(
+ custom = custom_vctr,
+ normal = normal_vctr
+ )
+ )
+
+ # check both column orders, since column order should stay in the same
+ # order whether the colunns are are extension types or not
+ mixed_record_batch2 <- record_batch(
+ normal = normal_vctr,
+ custom = custom_array
+ )
+ expect_identical(
+ mixed_record_batch2$to_data_frame(),
+ tibble::tibble(
+ normal = normal_vctr,
+ custom = custom_vctr
+ )
+ )
+})
+
+test_that("Table can roundtrip extension types", {
+ custom_vctr <- vctrs::new_vctr(1:8, class = "arrow_custom_test")
+ custom_array <- vctrs_extension_array(custom_vctr)
+ normal_vctr <- letters[1:8]
+
+ custom_table <- arrow_table(custom = custom_array)
+ expect_identical(
+ custom_table$to_data_frame(),
+ tibble::tibble(
+ custom = custom_vctr
+ )
+ )
+
+ mixed_table <- arrow_table(
+ custom = custom_array,
+ normal = normal_vctr
+ )
+ expect_identical(
+ mixed_table$to_data_frame(),
+ tibble::tibble(
+ custom = custom_vctr,
+ normal = normal_vctr
+ )
+ )
+
+ # check both column orders, since column order should stay in the same
+ # order whether the colunns are are extension types or not
+ mixed_table2 <- arrow_table(
+ normal = normal_vctr,
+ custom = custom_array
+ )
+ expect_identical(
+ mixed_table2$to_data_frame(),
+ tibble::tibble(
+ normal = normal_vctr,
+ custom = custom_vctr
+ )
+ )
+})
+
+test_that("Dataset/arrow_dplyr_query can roundtrip extension types", {
+ skip_if_not_available("dataset")
+
+ tf <- tempfile()
+ on.exit(unlink(tf, recursive = TRUE))
+
+ df <- expand.grid(
+ number = 1:10,
+ letter = letters,
+ stringsAsFactors = FALSE,
+ KEEP.OUT.ATTRS = FALSE
+ ) %>%
+ tibble::as_tibble()
+
+ df$extension <- vctrs::new_vctr(df$letter, class = "arrow_custom_vctr")
+
+ table <- arrow_table(
+ number = df$number,
+ letter = df$letter,
+ extension = vctrs_extension_array(df$extension)
+ )
+
+ table %>%
+ dplyr::group_by(number) %>%
+ write_dataset(tf)
+
+ roundtripped <- open_dataset(tf) %>%
+ dplyr::select(number, letter, extension) %>%
+ dplyr::collect()
+
+ expect_identical(unclass(roundtripped$extension), roundtripped$letter)
+})