You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by th...@apache.org on 2023/06/23 07:22:37 UTC
[arrow] branch main updated: GH-35542 : [R] Implement schema extraction function (#35543)
This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 29a339f573 GH-35542 : [R] Implement schema extraction function (#35543)
29a339f573 is described below
commit 29a339f573a25f1963b66bf9036bd253a266dac6
Author: Nic Crane <th...@gmail.com>
AuthorDate: Fri Jun 23 08:22:30 2023 +0100
GH-35542 : [R] Implement schema extraction function (#35543)
``` r
library(arrow)
mtcarrow <- arrow_table(mtcars)
schema(mtcarrow)
#> Schema
#> mpg: double
#> cyl: double
#> disp: double
#> hp: double
#> drat: double
#> wt: double
#> qsec: double
#> vs: double
#> am: double
#> gear: double
#> carb: double
#>
#> See $metadata for additional Schema metadata
```
<sup>Created on 2023-05-11 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
* Closes: #35542
Lead-authored-by: Nic Crane <th...@gmail.com>
Co-authored-by: Dewey Dunnington <de...@dunnington.ca>
Signed-off-by: Nic Crane <th...@gmail.com>
---
r/NAMESPACE | 6 +++
r/R/arrow-package.R | 1 +
r/R/csv.R | 2 +-
r/R/dplyr-collect.R | 3 +-
r/R/dplyr-summarize.R | 2 +-
r/R/schema.R | 77 +++++++++++++++++++++++++++---------
r/_pkgdown.yml | 4 +-
r/man/infer_schema.Rd | 14 +++++++
r/man/{Schema.Rd => schema-class.Rd} | 22 -----------
r/man/schema.Rd | 39 ++++++++++++++++++
r/tests/testthat/test-schema.R | 16 ++++++++
11 files changed, 141 insertions(+), 45 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 2bf1a863a2..e2078a1290 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -108,6 +108,10 @@ S3method(head,ExecPlanReader)
S3method(head,RecordBatchReader)
S3method(head,Scanner)
S3method(head,arrow_dplyr_query)
+S3method(infer_schema,ArrowTabular)
+S3method(infer_schema,Dataset)
+S3method(infer_schema,RecordBatchReader)
+S3method(infer_schema,arrow_dplyr_query)
S3method(infer_type,ArrowDatum)
S3method(infer_type,Expression)
S3method(infer_type,blob)
@@ -327,6 +331,7 @@ export(float64)
export(gs_bucket)
export(halffloat)
export(hive_partition)
+export(infer_schema)
export(infer_type)
export(install_arrow)
export(install_pyarrow)
@@ -455,6 +460,7 @@ importFrom(rlang,f_env)
importFrom(rlang,f_rhs)
importFrom(rlang,inform)
importFrom(rlang,is_bare_character)
+importFrom(rlang,is_bare_list)
importFrom(rlang,is_call)
importFrom(rlang,is_character)
importFrom(rlang,is_empty)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 6105a062a8..79871d8735 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -27,6 +27,7 @@
#' @importFrom rlang is_list call2 is_empty as_function as_label arg_match is_symbol is_call call_args
#' @importFrom rlang quo_set_env quo_get_env is_formula quo_is_call f_rhs parse_expr f_env new_quosure
#' @importFrom rlang new_quosures expr_text caller_env check_dots_empty check_dots_empty0 dots_list is_string inform
+#' @importFrom rlang is_bare_list
#' @importFrom tidyselect vars_pull eval_select eval_rename
#' @importFrom glue glue
#' @useDynLib arrow, .registration = TRUE
diff --git a/r/R/csv.R b/r/R/csv.R
index 6af46acb25..c8a13630d2 100644
--- a/r/R/csv.R
+++ b/r/R/csv.R
@@ -696,7 +696,7 @@ readr_to_csv_convert_options <- function(na,
}))
# To "guess" types, omit them from col_types
col_types <- keep(col_types, ~ !is.null(.x))
- col_types <- schema(!!!col_types)
+ col_types <- schema(col_types)
}
if (!is.null(col_types)) {
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index 970722e86a..c3232c6ff7 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -192,5 +192,6 @@ implicit_schema <- function(.data) {
aggregate_types(.data, hash, old_schm)
)
}
- schema(!!!new_fields)
+
+ schema(new_fields)
}
diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R
index 4eb24ab8b8..1e9d42969d 100644
--- a/r/R/dplyr-summarize.R
+++ b/r/R/dplyr-summarize.R
@@ -470,7 +470,7 @@ summarize_eval <- function(name, quosure, ctx, hash) {
list(
selected_columns = agg_field_refs,
.data = list(
- schema = schema(!!!agg_field_types)
+ schema = schema(agg_field_types)
)
)
)
diff --git a/r/R/schema.R b/r/R/schema.R
index dc0b4ba81f..c7bc8accb6 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -75,22 +75,8 @@
#' Files with compressed metadata are readable by older versions of arrow, but
#' the metadata is dropped.
#'
-#' @rdname Schema
+#' @rdname schema-class
#' @name Schema
-#' @examples
-#' schema(a = int32(), b = float64())
-#'
-#' schema(
-#' field("b", double()),
-#' field("c", bool(), nullable = FALSE),
-#' field("d", string())
-#' )
-#'
-#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
-#' tab1 <- arrow_table(df)
-#' tab1$schema
-#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
-#' tab2$schema
#' @export
Schema <- R6Class("Schema",
inherit = ArrowObject,
@@ -244,10 +230,63 @@ print_schema_fields <- function(s) {
paste(map_chr(s$fields, ~ .$ToString()), collapse = "\n")
}
-#' @param ... [fields][field] or field name/[data type][data-type] pairs
+#' Schemas
+#'
+#' Create a schema or extract one from an object.
+#'
+#' @seealso [Schema] for detailed documentation of the Schema R6 object
+#' @param ... [fields][field], field name/[data type][data-type] pairs (or a list of), or object from which to extract
+#' a schema
+#' @examples
+#' # Create schema using pairs of field names and data types
+#' schema(a = int32(), b = float64())
+#'
+#' # Create a schema using a list of pairs of field names and data types
+#' schema(list(a = int8(), b = string()))
+#'
+#' # Create schema using fields
+#' schema(
+#' field("b", double()),
+#' field("c", bool(), nullable = FALSE),
+#' field("d", string())
+#' )
+#'
+#' # Extract schemas from objects
+#' df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+#' tab1 <- arrow_table(df)
+#' schema(tab1)
+#' tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
+#' schema(tab2)
+#' @export
+schema <- function(...) {
+ dots <- list2(...)
+
+ if (length(dots) == 1 && !is_bare_list(dots[[1]]) && is.null(names(dots)) && !inherits(dots[[1]], "Field")) {
+ return(infer_schema(dots[[1]]))
+ }
+
+ Schema$create(!!!dots)
+}
+
+#' Extract a schema from an object
+#'
+#' @param x An object which has a schema, e.g. a `Dataset`
+#' @export
+infer_schema <- function(x) {
+ UseMethod("infer_schema")
+}
+
+#' @export
+infer_schema.ArrowTabular <- function(x) x$schema
+
+#' @export
+infer_schema.RecordBatchReader <- function(x) x$schema
+
+#' @export
+infer_schema.Dataset <- function(x) x$schema
+
#' @export
-#' @rdname Schema
-schema <- Schema$create
+infer_schema.arrow_dplyr_query <- function(x) implicit_schema(x)
#' @export
names.Schema <- function(x) x$names
@@ -411,7 +450,7 @@ as_schema.Schema <- function(x, ...) {
#' @rdname as_schema
#' @export
as_schema.StructType <- function(x, ...) {
- schema(!!!x$fields())
+ schema(x$fields())
}
#' @export
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index 26a77199b5..029debc772 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -206,7 +206,9 @@ reference:
- title: Arrow data types and schema
contents:
- - Schema
+ - schema
+ - infer_schema
+ - schema-class
- unify_schemas
- infer_type
- dictionary
diff --git a/r/man/infer_schema.Rd b/r/man/infer_schema.Rd
new file mode 100644
index 0000000000..3f7e5126c6
--- /dev/null
+++ b/r/man/infer_schema.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/schema.R
+\name{infer_schema}
+\alias{infer_schema}
+\title{Extract a schema from an object}
+\usage{
+infer_schema(x)
+}
+\arguments{
+\item{x}{An object which has a schema, e.g. a \code{Dataset}}
+}
+\description{
+Extract a schema from an object
+}
diff --git a/r/man/Schema.Rd b/r/man/schema-class.Rd
similarity index 87%
rename from r/man/Schema.Rd
rename to r/man/schema-class.Rd
index f81f6c397d..32250cdfe7 100644
--- a/r/man/Schema.Rd
+++ b/r/man/schema-class.Rd
@@ -3,14 +3,7 @@
\docType{class}
\name{Schema}
\alias{Schema}
-\alias{schema}
\title{Schema class}
-\usage{
-schema(...)
-}
-\arguments{
-\item{...}{\link[=field]{fields} or field name/\link[=data-type]{data type} pairs}
-}
\description{
A \code{Schema} is an Arrow object containing \link{Field}s, which map names to
Arrow \link[=data-type]{data types}. Create a \code{Schema} when you
@@ -74,18 +67,3 @@ Files with compressed metadata are readable by older versions of arrow, but
the metadata is dropped.
}
-\examples{
-schema(a = int32(), b = float64())
-
-schema(
- field("b", double()),
- field("c", bool(), nullable = FALSE),
- field("d", string())
-)
-
-df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
-tab1 <- arrow_table(df)
-tab1$schema
-tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
-tab2$schema
-}
diff --git a/r/man/schema.Rd b/r/man/schema.Rd
new file mode 100644
index 0000000000..42532d84b4
--- /dev/null
+++ b/r/man/schema.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/schema.R
+\name{schema}
+\alias{schema}
+\title{Schemas}
+\usage{
+schema(...)
+}
+\arguments{
+\item{...}{\link[=field]{fields}, field name/\link[=data-type]{data type} pairs (or a list of), or object from which to extract
+a schema}
+}
+\description{
+Create a schema or extract one from an object.
+}
+\examples{
+# Create schema using pairs of field names and data types
+schema(a = int32(), b = float64())
+
+# Create a schema using a list of pairs of field names and data types
+schema(list(a = int8(), b = string()))
+
+# Create schema using fields
+schema(
+ field("b", double()),
+ field("c", bool(), nullable = FALSE),
+ field("d", string())
+)
+
+# Extract schemas from objects
+df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+tab1 <- arrow_table(df)
+schema(tab1)
+tab2 <- arrow_table(df, schema = schema(col1 = int8(), col2 = float32()))
+schema(tab2)
+}
+\seealso{
+\link{Schema} for detailed documentation of the Schema R6 object
+}
diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R
index a6a0555eac..77259fb507 100644
--- a/r/tests/testthat/test-schema.R
+++ b/r/tests/testthat/test-schema.R
@@ -292,3 +292,19 @@ test_that("schema name assignment", {
expect_identical(names(schm2), c("col1", "col2"))
expect_identical(names(schm2$r_metadata$columns), c("col1", "col2"))
})
+
+test_that("schema extraction", {
+ skip_if_not_available("dataset")
+ tbl <- arrow_table(example_data)
+ expect_equal(schema(tbl), tbl$schema)
+
+ ds <- InMemoryDataset$create(example_data)
+ expect_equal(schema(ds), ds$schema)
+
+ rdr <- RecordBatchReader$create(record_batch(example_data))
+ expect_equal(schema(rdr), rdr$schema)
+
+ adq <- as_adq(example_data)
+ expect_equal(schema(adq), adq$.data$schema)
+
+})