You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2023/05/17 13:01:30 UTC

[arrow-nanoarrow] branch main updated: feat(r): Union array support (#195)

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f559d59  feat(r): Union array support (#195)
f559d59 is described below

commit f559d59aaa031111a07c8c163163205c60899ee0
Author: Dewey Dunnington <de...@dunnington.ca>
AuthorDate: Wed May 17 09:01:24 2023 -0400

    feat(r): Union array support (#195)
    
    Needed to support all the types that ADBC might return (see
    https://github.com/r-dbi/adbc/discussions/4 ). This currently returns
    unions as `data.frame()` with NA values (basically a sparse union
    equivalent for R). There are certainly other ways to represent that in R
    (maybe just a `list()`). R/nanoarrow models conversions in both
    directions as many <-> many, so a future PR could implement
    `as_nanoarrow_array(list(), schema = na_sparse|dense_union())` and/or
    `convert_array(union_array, list())`.
    
    ``` r
    library(nanoarrow)
    
    dense_union <- as_nanoarrow_array(
      data.frame(dbl = c(NA, 2), chr = c("one", NA)),
      schema = na_dense_union(list(dbl = na_double(), chr = na_string()))
    )
    
    sparse_union <- as_nanoarrow_array(
      data.frame(dbl = c(NA, 2), chr = c("one", NA)),
      schema = na_sparse_union(list(dbl = na_double(), chr = na_string()))
    )
    
    dense_union
    #> <nanoarrow_array dense_union([0,1])[2]>
    #>  $ length    : int 2
    #>  $ null_count: int 0
    #>  $ offset    : int 0
    #>  $ buffers   :List of 2
    #>   ..$ :<nanoarrow_buffer_type_id[2 b] at 0x116b8b310>
    #>   ..$ :<nanoarrow_buffer_union_offset[8 b] at 0x1270ca660>
    #>  $ children  :List of 2
    #>   ..$ dbl:<nanoarrow_array double[1]>
    #>   .. ..$ length    : int 1
    #>   .. ..$ null_count: int 0
    #>   .. ..$ offset    : int 0
    #>   .. ..$ buffers   :List of 2
    #>   .. .. ..$ :<nanoarrow_buffer_validity[0 b] at 0x0>
    #>   .. .. ..$ :<nanoarrow_buffer_data_double[8 b] at 0x11693e528>
    #>   .. ..$ dictionary: NULL
    #>   .. ..$ children  : list()
    #>   ..$ chr:<nanoarrow_array string[1]>
    #>   .. ..$ length    : int 1
    #>   .. ..$ null_count: int 0
    #>   .. ..$ offset    : int 0
    #>   .. ..$ buffers   :List of 3
    #>   .. .. ..$ :<nanoarrow_buffer_validity[0 b] at 0x0>
    #>   .. .. ..$ :<nanoarrow_buffer_data_offset32[8 b] at 0x13700ceb0>
    #>   .. .. ..$ :<nanoarrow_buffer_data_utf8[3 b] at 0x1370eff30>
    #>   .. ..$ dictionary: NULL
    #>   .. ..$ children  : list()
    #>  $ dictionary: NULL
    convert_array(dense_union)
    #>   dbl  chr
    #> 1  NA  one
    #> 2   2 <NA>
    
    sparse_union
    #> <nanoarrow_array sparse_union([0,1])[2]>
    #>  $ length    : int 2
    #>  $ null_count: int 0
    #>  $ offset    : int 0
    #>  $ buffers   :List of 1
    #>   ..$ :<nanoarrow_buffer_type_id[2 b] at 0x11731f630>
    #>  $ children  :List of 2
    #>   ..$ dbl:<nanoarrow_array double[2]>
    #>   .. ..$ length    : int 2
    #>   .. ..$ null_count: int 1
    #>   .. ..$ offset    : int 0
    #>   .. ..$ buffers   :List of 2
    #>   .. .. ..$ :<nanoarrow_buffer_validity[1 b] at 0x1370e6070>
    #>   .. .. ..$ :<nanoarrow_buffer_data_double[16 b] at 0x1200305f8>
    #>   .. ..$ dictionary: NULL
    #>   .. ..$ children  : list()
    #>   ..$ chr:<nanoarrow_array string[2]>
    #>   .. ..$ length    : int 2
    #>   .. ..$ null_count: int 1
    #>   .. ..$ offset    : int 0
    #>   .. ..$ buffers   :List of 3
    #>   .. .. ..$ :<nanoarrow_buffer_validity[1 b] at 0x1370fd540>
    #>   .. .. ..$ :<nanoarrow_buffer_data_offset32[12 b] at 0x1370f1e00>
    #>   .. .. ..$ :<nanoarrow_buffer_data_utf8[3 b] at 0x1370f1e10>
    #>   .. ..$ dictionary: NULL
    #>   .. ..$ children  : list()
    #>  $ dictionary: NULL
    convert_array(sparse_union)
    #>   dbl  chr
    #> 1  NA  one
    #> 2   2 <NA>
    ```
    
    <sup>Created on 2023-05-15 with [reprex
    v2.0.2](https://reprex.tidyverse.org)</sup>
---
 r/R/as-array.R                   | 65 +++++++++++++++++++++++++++++
 r/src/array_stream.c             | 11 +++--
 r/src/as_array.c                 | 31 ++++++++------
 r/src/convert_array.c            | 23 ++++++++--
 r/src/infer_ptype.c              |  2 +
 r/src/materialize.c              | 90 ++++++++++++++++++++++++++++++++++++----
 r/src/schema.h                   |  7 ++++
 r/tests/testthat/test-as-array.R | 76 +++++++++++++++++++++++++++++++++
 8 files changed, 277 insertions(+), 28 deletions(-)

diff --git a/r/R/as-array.R b/r/R/as-array.R
index c381dfe..a9781b1 100644
--- a/r/R/as-array.R
+++ b/r/R/as-array.R
@@ -210,6 +210,71 @@ as_nanoarrow_array.vctrs_unspecified <- function(x, ..., schema = NULL) {
   )
 }
 
+# Called from C to build a dense or sparse union array from a data frame.
+# `x` has one column per union child and at most one non-NA value per row;
+# that value's column becomes the row's child (all-NA rows use the first).
+# Other objects could map to unions too; this is just enough for testing.
+union_array_from_data_frame <- function(x, schema) {
+  if (length(x) == 0 || length(x) > 127) {
+    stop(sprintf(
+      "Can't convert data frame with %d columns to union array",
+      length(x)
+    ))
+  }
+
+  # Logical matrix with one row per row of x and one column per column of x
+  x_is_na <- do.call("cbind", lapply(x, is.na))
+
+  # A row maps to exactly one child, so a second non-NA value in the same
+  # row would be dropped silently: refuse such input up front
+  stopifnot(all(rowSums(!x_is_na) <= 1))
+
+  # Zero-based column of the first non-NA value in each row (0 when all NA)
+  child_index <- rep_len(0L, nrow(x))
+  seq_x <- seq_along(x)
+  for (i in seq_along(child_index)) {
+    for (j in seq_x) {
+      if (!x_is_na[i, j]) {
+        child_index[i] <- j - 1L
+        break
+      }
+    }
+  }
+
+  switch(
+    nanoarrow_schema_parse(schema)$storage_type,
+    "dense_union" = {
+      # Each child keeps only its own rows; a row's offset is its zero-based
+      # position among the rows assigned to the same child
+      is_child <- lapply(seq_x - 1L, "==", child_index)
+      child_offset_each <- lapply(is_child, function(x) cumsum(x) - 1L)
+      child_offset <- vapply(seq_along(child_index), function(i) {
+        child_offset_each[[child_index[i] + 1L]][i]
+      }, integer(1))
+
+      children <- Map("[", x, is_child, drop = FALSE)
+      names(children) <- names(schema$children)
+      array <- nanoarrow_array_init(schema)
+      nanoarrow_array_modify(array, list(
+        length = length(child_index),
+        null_count = 0,
+        buffers = list(as.raw(child_index), as.integer(child_offset)),
+        children = children
+      ))
+    },
+    "sparse_union" = {
+      # Build a struct with the union's child types (hence `schema =`), then
+      # relabel it as the union and replace buffer 1 with the child indices
+      struct_schema <- na_struct(schema$children)
+      array <- as_nanoarrow_array(x, schema = struct_schema)
+      nanoarrow_array_set_schema(array, schema, validate = FALSE)
+      array$buffers[[1]] <- as.raw(child_index)
+      array
+    },
+    stop("Attempt to create union from non-union array type")
+  )
+}
+
 # This is defined because it's verbose to pass named arguments from C.
 # When converting data frame columns, we try the internal C conversions
 # first to save R evaluation overhead. When the internal conversions fail,
diff --git a/r/src/array_stream.c b/r/src/array_stream.c
index f0832b8..685b57c 100644
--- a/r/src/array_stream.c
+++ b/r/src/array_stream.c
@@ -78,14 +78,19 @@ SEXP nanoarrow_c_array_stream_get_next(SEXP array_stream_xptr) {
 SEXP nanoarrow_c_basic_array_stream(SEXP batches_sexp, SEXP schema_xptr,
                                     SEXP validate_sexp) {
   int validate = LOGICAL(validate_sexp)[0];
-  struct ArrowSchema* schema = schema_from_xptr(schema_xptr);
+
+  // Schema needs a copy here because ArrowBasicArrayStreamInit() takes ownership
+  SEXP schema_copy_xptr = PROTECT(schema_owning_xptr());
+  struct ArrowSchema* schema_copy =
+      (struct ArrowSchema*)R_ExternalPtrAddr(schema_copy_xptr);
+  schema_export(schema_xptr, schema_copy);
 
   SEXP array_stream_xptr = PROTECT(array_stream_owning_xptr());
   struct ArrowArrayStream* array_stream =
       (struct ArrowArrayStream*)R_ExternalPtrAddr(array_stream_xptr);
 
   int64_t n_arrays = Rf_xlength(batches_sexp);
-  if (ArrowBasicArrayStreamInit(array_stream, schema, n_arrays) != NANOARROW_OK) {
+  if (ArrowBasicArrayStreamInit(array_stream, schema_copy, n_arrays) != NANOARROW_OK) {
     Rf_error("Failed to initialize array stream");
   }
 
@@ -102,7 +107,7 @@ SEXP nanoarrow_c_basic_array_stream(SEXP batches_sexp, SEXP schema_xptr,
     }
   }
 
-  UNPROTECT(1);
+  UNPROTECT(2);
   return array_stream_xptr;
 }
 
diff --git a/r/src/as_array.c b/r/src/as_array.c
index 6221d8e..adc3ef7 100644
--- a/r/src/as_array.c
+++ b/r/src/as_array.c
@@ -29,8 +29,8 @@
 #include "util.h"
 
 static void call_as_nanoarrow_array(SEXP x_sexp, struct ArrowArray* array,
-                                    SEXP schema_xptr) {
-  SEXP fun = PROTECT(Rf_install("as_nanoarrow_array_from_c"));
+                                    SEXP schema_xptr, const char* fun_name) {
+  SEXP fun = PROTECT(Rf_install(fun_name));
   SEXP call = PROTECT(Rf_lang3(fun, x_sexp, schema_xptr));
   SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
 
@@ -57,7 +57,7 @@ static void as_array_int(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr
 
   // Only consider the default create for now
   if (schema_view.type != NANOARROW_TYPE_INT32) {
-    call_as_nanoarrow_array(x_sexp, array, schema_xptr);
+    call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
     return;
   }
 
@@ -132,7 +132,7 @@ static void as_array_lgl(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr
 
   // Only consider bool for now
   if (schema_view.type != NANOARROW_TYPE_BOOL) {
-    call_as_nanoarrow_array(x_sexp, array, schema_xptr);
+    call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
     return;
   }
 
@@ -210,7 +210,7 @@ static void as_array_dbl(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr
     case NANOARROW_TYPE_INT32:
       break;
     default:
-      call_as_nanoarrow_array(x_sexp, array, schema_xptr);
+      call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
       return;
   }
 
@@ -329,7 +329,7 @@ static void as_array_chr(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xptr
 
   // Only consider the default create for now
   if (schema_view.type != NANOARROW_TYPE_STRING) {
-    call_as_nanoarrow_array(x_sexp, array, schema_xptr);
+    call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
     return;
   }
 
@@ -415,9 +415,16 @@ static void as_array_data_frame(SEXP x_sexp, struct ArrowArray* array, SEXP sche
     Rf_error("ArrowSchemaViewInit(): %s", error->message);
   }
 
-  if (schema_view.type != NANOARROW_TYPE_STRUCT) {
-    call_as_nanoarrow_array(x_sexp, array, schema_xptr);
-    return;
+  switch (schema_view.type) {
+    case NANOARROW_TYPE_SPARSE_UNION:
+    case NANOARROW_TYPE_DENSE_UNION:
+      call_as_nanoarrow_array(x_sexp, array, schema_xptr, "union_array_from_data_frame");
+      return;
+    case NANOARROW_TYPE_STRUCT:
+      break;
+    default:
+      call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
+      return;
   }
 
   if (Rf_xlength(x_sexp) != schema->n_children) {
@@ -459,7 +466,7 @@ static void as_array_list(SEXP x_sexp, struct ArrowArray* array, SEXP schema_xpt
   // Arbitrary nested list support is complicated without some concept of a
   // "builder", which we don't use.
   if (schema_view.type != NANOARROW_TYPE_BINARY) {
-    call_as_nanoarrow_array(x_sexp, array, schema_xptr);
+    call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
     return;
   }
 
@@ -542,7 +549,7 @@ static void as_array_default(SEXP x_sexp, struct ArrowArray* array, SEXP schema_
       as_array_data_frame(x_sexp, array, schema_xptr, error);
       return;
     } else {
-      call_as_nanoarrow_array(x_sexp, array, schema_xptr);
+      call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
       return;
     }
   }
@@ -564,7 +571,7 @@ static void as_array_default(SEXP x_sexp, struct ArrowArray* array, SEXP schema_
       as_array_list(x_sexp, array, schema_xptr, error);
       return;
     default:
-      call_as_nanoarrow_array(x_sexp, array, schema_xptr);
+      call_as_nanoarrow_array(x_sexp, array, schema_xptr, "as_nanoarrow_array_from_c");
       return;
   }
 }
diff --git a/r/src/convert_array.c b/r/src/convert_array.c
index bb31958..0cd66be 100644
--- a/r/src/convert_array.c
+++ b/r/src/convert_array.c
@@ -33,6 +33,10 @@
 // (i.e., no need to allocate a zero-size ptype) and returning ALTREP
 // where possible.
 
+// borrow nanoarrow_c_infer_ptype() from infer_ptype.c
+SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);
+enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr);
+
 // This calls nanoarrow::convert_array() (via a package helper) to try S3
 // dispatch to find a convert_array() method (or error if there
 // isn't one)
@@ -107,6 +111,21 @@ static SEXP convert_array_chr(SEXP array_xptr) {
 SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp);
 
 static SEXP convert_array_data_frame(SEXP array_xptr, SEXP ptype_sexp) {
+  // If array_xptr is a union, use default convert behaviour
+  struct ArrowSchema* schema = schema_from_array_xptr(array_xptr);
+  struct ArrowSchemaView schema_view;
+  if (ArrowSchemaViewInit(&schema_view, schema, NULL) != NANOARROW_OK) {
+    Rf_error("Invalid schema");
+  }
+
+  if (schema_view.storage_type != NANOARROW_TYPE_STRUCT) {
+    ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(array_xptr_get_schema(array_xptr)));
+    SEXP default_result =
+        convert_array_default(array_xptr, VECTOR_TYPE_DATA_FRAME, ptype_sexp);
+    UNPROTECT(1);
+    return default_result;
+  }
+
   struct ArrowArray* array = array_from_xptr(array_xptr);
   R_xlen_t n_col = array->n_children;
   SEXP result = PROTECT(Rf_allocVector(VECSXP, n_col));
@@ -155,10 +174,6 @@ static SEXP convert_array_data_frame(SEXP array_xptr, SEXP ptype_sexp) {
   return result;
 }
 
-// borrow nanoarrow_c_infer_ptype() from infer_ptype.c
-SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);
-enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr);
-
 SEXP nanoarrow_c_convert_array(SEXP array_xptr, SEXP ptype_sexp) {
   // See if we can skip any ptype resolution at all
   if (ptype_sexp == R_NilValue) {
diff --git a/r/src/infer_ptype.c b/r/src/infer_ptype.c
index ed72522..1e0879f 100644
--- a/r/src/infer_ptype.c
+++ b/r/src/infer_ptype.c
@@ -58,6 +58,8 @@ enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) {
     case NANOARROW_TYPE_LARGE_STRING:
       return VECTOR_TYPE_CHR;
 
+    case NANOARROW_TYPE_DENSE_UNION:
+    case NANOARROW_TYPE_SPARSE_UNION:
     case NANOARROW_TYPE_STRUCT:
       return VECTOR_TYPE_DATA_FRAME;
 
diff --git a/r/src/materialize.c b/r/src/materialize.c
index ad88f83..48b8c32 100644
--- a/r/src/materialize.c
+++ b/r/src/materialize.c
@@ -137,22 +137,94 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) {
   return result;
 }
 
+// Set elements [offset, offset + len) of x to NA (NULL for list elements),
+// recursing into data frame columns; unions pre-fill their output this way.
+static void fill_vec_with_nulls(SEXP x, R_xlen_t offset, R_xlen_t len) {
+  if (nanoarrow_ptype_is_data_frame(x)) {
+    for (R_xlen_t col = 0; col < Rf_xlength(x); col++) {
+      fill_vec_with_nulls(VECTOR_ELT(x, col), offset, len);
+    }
+    return;
+  }
+
+  switch (TYPEOF(x)) {
+    case LGLSXP:
+    case INTSXP: {
+      int* dst = INTEGER(x);
+      for (R_xlen_t pos = offset; pos < offset + len; pos++) {
+        dst[pos] = NA_INTEGER;
+      }
+      return;
+    }
+    case REALSXP: {
+      double* dst = REAL(x);
+      for (R_xlen_t pos = offset; pos < offset + len; pos++) {
+        dst[pos] = NA_REAL;
+      }
+      return;
+    }
+    case STRSXP:
+      for (R_xlen_t pos = offset; pos < offset + len; pos++) {
+        SET_STRING_ELT(x, pos, NA_STRING);
+      }
+      return;
+    case VECSXP:
+      for (R_xlen_t pos = offset; pos < offset + len; pos++) {
+        SET_VECTOR_ELT(x, pos, R_NilValue);
+      }
+      return;
+    default:
+      Rf_error("Attempt to fill vector with nulls with unsupported type");
+  }
+}
+
 static int nanoarrow_materialize_data_frame(struct RConverter* converter,
                                             SEXP converter_xptr) {
   if (converter->ptype_view.vector_type != VECTOR_TYPE_DATA_FRAME) {
     return EINVAL;
   }
 
-  for (R_xlen_t i = 0; i < converter->n_children; i++) {
-    converter->children[i]->src.offset = converter->src.offset;
-    converter->children[i]->src.length = converter->src.length;
-    converter->children[i]->dst.offset = converter->dst.offset;
-    converter->children[i]->dst.length = converter->dst.length;
-    NANOARROW_RETURN_NOT_OK(
-        nanoarrow_materialize(converter->children[i], converter_xptr));
-  }
+  SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
+  SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
 
-  return NANOARROW_OK;
+  switch (converter->array_view.storage_type) {
+    case NANOARROW_TYPE_STRUCT:
+      for (R_xlen_t i = 0; i < converter->n_children; i++) {
+        converter->children[i]->src.offset = converter->src.offset;
+        converter->children[i]->src.length = converter->src.length;
+        converter->children[i]->dst.offset = converter->dst.offset;
+        converter->children[i]->dst.length = converter->dst.length;
+        SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, i);
+        NANOARROW_RETURN_NOT_OK(
+            nanoarrow_materialize(converter->children[i], child_converter_xptr));
+      }
+      return NANOARROW_OK;
+
+    case NANOARROW_TYPE_DENSE_UNION:
+    case NANOARROW_TYPE_SPARSE_UNION:
+      // Pre-fill the destination range with nulls: each row below writes one child only
+      fill_vec_with_nulls(converter->dst.vec_sexp, converter->dst.offset,
+                          converter->dst.length);
+
+      // Convert one row at a time from whichever child the union selects for that row
+      for (R_xlen_t i = 0; i < converter->dst.length; i++) {
+        int64_t child_index = ArrowArrayViewUnionChildIndex(&converter->array_view,
+                                                            converter->src.offset + i);
+        int64_t child_offset = ArrowArrayViewUnionChildOffset(&converter->array_view,
+                                                              converter->src.offset + i);
+        converter->children[child_index]->src.offset = child_offset;
+        converter->children[child_index]->src.length = 1;
+        converter->children[child_index]->dst.offset = converter->dst.offset + i;
+        converter->children[child_index]->dst.length = 1;
+        SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, child_index);
+        NANOARROW_RETURN_NOT_OK(nanoarrow_materialize(converter->children[child_index],
+                                                      child_converter_xptr));
+      }
+      return NANOARROW_OK;
+
+    default:
+      return ENOTSUP;
+  }
 }
 
 static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr,
diff --git a/r/src/schema.h b/r/src/schema.h
index c1ec55b..5f252ab 100644
--- a/r/src/schema.h
+++ b/r/src/schema.h
@@ -79,4 +79,11 @@ static inline SEXP schema_owning_xptr(void) {
   return schema_xptr;
 }
 
+// Deep-copy the schema wrapped by schema_xptr into schema_copy (R error on failure).
+static inline void schema_export(SEXP schema_xptr, struct ArrowSchema* schema_copy) {
+  if (ArrowSchemaDeepCopy(schema_from_xptr(schema_xptr), schema_copy) != NANOARROW_OK) {
+    Rf_error("ArrowSchemaDeepCopy() failed");
+  }
+}
+
 #endif
diff --git a/r/tests/testthat/test-as-array.R b/r/tests/testthat/test-as-array.R
index fac5e65..7ed5cc8 100644
--- a/r/tests/testthat/test-as-array.R
+++ b/r/tests/testthat/test-as-array.R
@@ -495,3 +495,79 @@ test_that("as_nanoarrow_array() works for bad unspecified() create", {
     as_nanoarrow_array(vctrs::unspecified(5), schema = na_interval_day_time())
   )
 })
+
+test_that("as_nanoarrow_array() can convert data.frame() to sparse_union()", {
+  # Features: one child (lgl) with more than one non-NA value and one row
+  # (the fifth) in which every column is NA.
+  test_df <- data.frame(
+    lgl = c(TRUE, NA, NA, NA, NA, FALSE),
+    int = c(NA, 123L, NA, NA, NA, NA),
+    dbl = c(NA, NA, 456, NA, NA, NA),
+    chr = c(NA, NA, NA, "789", NA, NA),
+    stringsAsFactors = FALSE
+  )
+
+  array <- as_nanoarrow_array(
+    test_df,
+    schema = na_sparse_union(lapply(test_df, infer_nanoarrow_schema))
+  )
+
+  expect_identical(infer_nanoarrow_schema(array)$format, "+us:0,1,2,3")
+  expect_identical(array$length, 6L)
+  expect_identical(array$null_count, 0L)
+  expect_identical(
+    as.raw(array$buffers[[1]]),
+    as.raw(as_nanoarrow_buffer(as.raw(c(0L, 1L, 2L, 3L, 0L, 0L))))
+  )
+
+  expect_identical(
+    lapply(array$children, convert_array),
+    lapply(test_df, identity)
+  )
+  expect_identical(convert_array(array), test_df)
+})
+
+test_that("as_nanoarrow_array() can convert data.frame() to dense_union()", {
+  # Features: one child (lgl) with more than one non-NA value and one row
+  # (the fifth) where every column is NA; that row is assigned to child 0.
+  test_df <- data.frame(
+    lgl = c(TRUE, NA, NA, NA, NA, FALSE),
+    int = c(NA, 123L, NA, NA, NA, NA),
+    dbl = c(NA, NA, 456, NA, NA, NA),
+    chr = c(NA, NA, NA, "789", NA, NA),
+    stringsAsFactors = FALSE
+  )
+
+  array <- as_nanoarrow_array(
+    test_df,
+    schema = na_dense_union(lapply(test_df, infer_nanoarrow_schema))
+  )
+
+  expect_identical(infer_nanoarrow_schema(array)$format, "+ud:0,1,2,3")
+  expect_identical(array$length, 6L)
+  expect_identical(array$null_count, 0L)
+  expect_identical(
+    as.raw(array$buffers[[1]]),
+    as.raw(as_nanoarrow_buffer(as.raw(c(0L, 1L, 2L, 3L, 0L, 0L))))
+  )
+  # Offsets count rows within each child: rows 1, 5 and 6 all go to lgl
+  expect_identical(
+    as.raw(array$buffers[[2]]),
+    as.raw(as_nanoarrow_buffer(c(0L, 0L, 0L, 0L, 1L, 2L)))
+  )
+
+  expect_identical(
+    lapply(array$children, convert_array),
+    list(
+      lgl = c(TRUE, NA, FALSE), int = 123L, dbl = 456, chr = "789"
+    )
+  )
+  expect_identical(convert_array(array), test_df)
+})
+
+test_that("as_nanoarrow_array() for union type errors for unsupported objects", {
+  # A data frame without columns offers no union children to pick from
+  msg <- "Can't convert data frame with 0 columns"
+  empty_df <- data.frame()
+  expect_error(as_nanoarrow_array(empty_df, schema = na_dense_union()), msg)
+})