You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by th...@apache.org on 2023/06/23 07:22:37 UTC
[arrow] branch main updated: GH-35542 : [R] Implement schema extraction function (#35543)

This is an automated email from the ASF dual-hosted git repository.

thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 29a339f573 GH-35542 : [R] Implement schema extraction function (#35543)
29a339f573 is described below

commit 29a339f573a25f1963b66bf9036bd253a266dac6
Author: Nic Crane <th...@gmail.com>
AuthorDate: Fri Jun 23 08:22:30 2023 +0100

    GH-35542 : [R] Implement schema extraction function (#35543)
    
    ``` r
    library(arrow)
    mtcarrow <- arrow_table(mtcars)
    schema(mtcarrow)
    #> Schema
    #> mpg: double
    #> cyl: double
    #> disp: double
    #> hp: double
    #> drat: double
    #> wt: double
    #> qsec: double
    #> vs: double
    #> am: double
    #> gear: double
    #> carb: double
    #>
    #> See $metadata for additional Schema metadata
    ```
    
    <sup>Created on 2023-05-11 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
    
    * Closes: #35542
    
    Lead-authored-by: Nic Crane <th...@gmail.com>
    Co-authored-by: Dewey Dunnington <de...@dunnington.ca>
    Signed-off-by: Nic Crane <th...@gmail.com>
---
 r/NAMESPACE                          |  6 +++
 r/R/arrow-package.R                  |  1 +
 r/R/csv.R                            |  2 +-
 r/R/dplyr-collect.R                  |  3 +-
 r/R/dplyr-summarize.R                |  2 +-
 r/R/schema.R                         | 77 +++++++++++++++++++++++++++---------
 r/_pkgdown.yml                       |  4 +-
 r/man/infer_schema.Rd                | 14 +++++++
 r/man/{Schema.Rd => schema-class.Rd} | 22 -----------
 r/man/schema.Rd                      | 39 ++++++++++++++++++
 r/tests/testthat/test-schema.R       | 16 ++++++++
 11 files changed, 141 insertions(+), 45 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 2bf1a863a2..e2078a1290 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -108,6 +108,10 @@ S3method(head,ExecPlanReader)
 S3method(head,RecordBatchReader)
 S3method(head,Scanner)
 S3method(head,arrow_dplyr_query)
+S3method(infer_schema,ArrowTabular)
+S3method(infer_schema,Dataset)
+S3method(infer_schema,RecordBatchReader)
+S3method(infer_schema,arrow_dplyr_query)
 S3method(infer_type,ArrowDatum)
 S3method(infer_type,Expression)
 S3method(infer_type,blob)
@@ -327,6 +331,7 @@ export(float64)
 export(gs_bucket)
 export(halffloat)
 export(hive_partition)
+export(infer_schema)
 export(infer_type)
 export(install_arrow)
 export(install_pyarrow)
@@ -455,6 +460,7 @@ importFrom(rlang,f_env)
 importFrom(rlang,f_rhs)
 importFrom(rlang,inform)
 importFrom(rlang,is_bare_character)
+importFrom(rlang,is_bare_list)
 importFrom(rlang,is_call)
 importFrom(rlang,is_character)
 importFrom(rlang,is_empty)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 6105a062a8..79871d8735 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -27,6 +27,7 @@
 #' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args
 #' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure
 #' @importFrom rlang new_quosures expr_text caller_env check_dots_empty check_dots_empty0 dots_list is_string inform
+#' @importFrom rlang is_bare_list
 #' @importFrom tidyselect vars_pull eval_select eval_rename
 #' @importFrom glue glue
 #' @useDynLib arrow, .registration = TRUE
diff --git a/r/R/csv.R b/r/R/csv.R
index 6af46acb25..c8a13630d2 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -696,7 +696,7 @@ readr_to_csv_convert_options <- function(na,
     }))
     # To "guess" types, omit them from col_types
     col_types <- keep(col_types, ~ !is.null(.x))
-    col_types <- schema(!!!col_types)
+    col_types <- schema(col_types)
   }
 
   if (!is.null(col_types)) {
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index 970722e86a..c3232c6ff7 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -192,5 +192,6 @@ implicit_schema <- function(.data) {
       aggregate_types(.data, hash, old_schm)
     )
   }
-  schema(!!!new_fields)
+
+  schema(new_fields)
 }
diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 4eb24ab8b8..1e9d42969d 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -470,7 +470,7 @@ summarize_eval <- function(name, quosure, ctx, hash) {
       list(
         selected_columns = agg_field_refs,
         .data = list(
-          schema = schema(!!!agg_field_types)
+          schema = schema(agg_field_types)
         )
       )
     )
diff --git a/r/R/schema.R b/r/R/schema.R
index dc0b4ba81f..c7bc8accb6 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -75,22 +75,8 @@
 #'   Files with compressed metadata are readable by older versions of arrow, but
 #'   the metadata is dropped.
 #'
-#' @rdname Schema
+#' @rdname schema-class
 #' @name Schema
-#' @examples
-#' schema(a = int32(), b = float64())
-#'
-#' schema(
-#'   field("b", double()),
-#'   field("c", bool(), nullable = FALSE),
-#'   field("d", string())
-#' )
-#'
-#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
-#' tab1 <- arrow_table(df)
-#' tab1$schema
-#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
-#' tab2$schema
 #' @export
 Schema <- R6Class("Schema",
   inherit = ArrowObject,
@@ -244,10 +230,63 @@ print_schema_fields <- function(s) {
   paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n")
 }
 
-#' @param ... [fields][field] or field name/[data type][data-type] pairs
+#' Schemas
+#'
+#' Create a schema or extract one from an object.
+#'
+#' @seealso [Schema] for detailed documentation of the Schema R6 object
+#' @param ... [fields][field], field name/[data type][data-type] pairs (or a list of), or object from which to extract
+#'  a schema
+#' @examples
+#' # Create schema using pairs of field names and data types
+#' schema(a = int32(), b = float64())
+#'
+#' # Create a schema using a list of pairs of field names and data types
+#' schema(list(a = int8(), b = string()))
+#'
+#' # Create schema using fields
+#' schema(
+#'   field("b", double()),
+#'   field("c", bool(), nullable = FALSE),
+#'   field("d", string())
+#' )
+#'
+#' # Extract schemas from objects
+#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+#' tab1 <- arrow_table(df)
+#' schema(tab1)
+#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
+#' schema(tab2)
+#' @export
+schema <- function(...) {
+  dots <- list2(...)
+
+  if (length(dots) == 1 && !is_bare_list(dots[[1]]) && is.null(names(dots)) && !inherits(dots[[1]], "Field")) {
+    return(infer_schema(dots[[1]]))
+  }
+
+  Schema$create(!!!dots)
+}
+
+#' Extract a schema from an object
+#'
+#' @param x An object which has a schema, e.g. a `Dataset`
+#' @export
+infer_schema <- function(x) {
+  UseMethod("infer_schema")
+}
+
+#' @export
+infer_schema.ArrowTabular <- function(x) x$schema
+
+#' @export
+infer_schema.RecordBatchReader <- function(x) x$schema
+
+#' @export
+infer_schema.Dataset <- function(x) x$schema
+
 #' @export
-#' @rdname Schema
-schema <- Schema$create
+infer_schema.arrow_dplyr_query <- function(x) implicit_schema(x)
 
 #' @export
 names.Schema <- function(x) x$names
@@ -411,7 +450,7 @@ as_schema.Schema <- function(x, ...) {
 #' @rdname as_schema
 #' @export
 as_schema.StructType <- function(x, ...) {
-  schema(!!!x$fields())
+  schema(x$fields())
 }
 
 #' @export
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 26a77199b5..029debc772 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -206,7 +206,9 @@ reference:
 
   - title: Arrow data types and schema
     contents:
-      - Schema
+      - schema
+      - infer_schema
+      - schema-class
       - unify_schemas
       - infer_type
       - dictionary
diff --git a/r/man/infer_schema.Rd b/r/man/infer_schema.Rd
new file mode 100644
index 0000000000..3f7e5126c6
--- /dev/null
+++ b/r/man/infer_schema.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/schema.R
+\name{infer_schema}
+\alias{infer_schema}
+\title{Extract a schema from an object}
+\usage{
+infer_schema(x)
+}
+\arguments{
+\item{x}{An object which has a schema, e.g. a \code{Dataset}}
+}
+\description{
+Extract a schema from an object
+}
diff --git a/r/man/Schema.Rd b/r/man/schema-class.Rd
similarity index 87%
rename from r/man/Schema.Rd
rename to r/man/schema-class.Rd
index f81f6c397d..32250cdfe7 100644
--- a/r/man/Schema.Rd
+++ b/r/man/schema-class.Rd
@@ -3,14 +3,7 @@
 \docType{class}
 \name{Schema}
 \alias{Schema}
-\alias{schema}
 \title{Schema class}
-\usage{
-schema(...)
-}
-\arguments{
-\item{...}{\link[=field]{fields} or field name/\link[=data-type]{data type} pairs}
-}
 \description{
 A \code{Schema} is an Arrow object containing \link{Field}s, which map names to
 Arrow \link[=data-type]{data types}. Create a \code{Schema} when you
@@ -74,18 +67,3 @@ Files with compressed metadata are readable by older versions of arrow, but
 the metadata is dropped.
 }
 
-\examples{
-schema(a = int32(), b = float64())
-
-schema(
-  field("b", double()),
-  field("c", bool(), nullable = FALSE),
-  field("d", string())
-)
-
-df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
-tab1 <- arrow_table(df)
-tab1$schema
-tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
-tab2$schema
-}
diff --git a/r/man/schema.Rd b/r/man/schema.Rd
new file mode 100644
index 0000000000..42532d84b4
--- /dev/null
+++ b/r/man/schema.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/schema.R
+\name{schema}
+\alias{schema}
+\title{Schemas}
+\usage{
+schema(...)
+}
+\arguments{
+\item{...}{\link[=field]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract
+a schema}
+}
+\description{
+Create a schema or extract one from an object.
+}
+\examples{
+# Create schema using pairs of field names and data types
+schema(a = int32(), b = float64())
+
+# Create a schema using a list of pairs of field names and data types
+schema(list(a = int8(), b = string()))
+
+# Create schema using fields
+schema(
+  field("b", double()),
+  field("c", bool(), nullable = FALSE),
+  field("d", string())
+)
+
+# Extract schemas from objects
+df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+tab1 <- arrow_table(df)
+schema(tab1)
+tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
+schema(tab2)
+}
+\seealso{
+\link{Schema} for detailed documentation of the Schema R6 object
+}
diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R
index a6a0555eac..77259fb507 100644
--- a/r/tests/testthat/test-schema.R
+++ b/r/tests/testthat/test-schema.R
@@ -292,3 +292,19 @@ test_that("schema name assignment", {
   expect_identical(names(schm2), c("col1", "col2"))
   expect_identical(names(schm2$r_metadata$columns), c("col1", "col2"))
 })
+
+test_that("schema extraction", {
+  skip_if_not_available("dataset")
+  tbl <- arrow_table(example_data)
+  expect_equal(schema(tbl), tbl$schema)
+
+  ds <- InMemoryDataset$create(example_data)
+  expect_equal(schema(ds), ds$schema)
+
+  rdr <- RecordBatchReader$create(record_batch(example_data))
+  expect_equal(schema(rdr), rdr$schema)
+
+  adq <- as_adq(example_data)
+  expect_equal(schema(adq), adq$.data$schema)
+
+})