You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2022/10/20 21:43:52 UTC
[arrow] 05/13: ARROW-18079: [R] Improve efficiency of schema creation to prevent performance regressions (#14447)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch maint-10.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 04ccd84bb88123a55134edb982cc695a5d5e0632
Author: Nic Crane <th...@gmail.com>
AuthorDate: Tue Oct 18 16:22:33 2022 +0100

    ARROW-18079: [R] Improve efficiency of schema creation to prevent performance regressions (#14447)
    
    Authored-by: Nic Crane <th...@gmail.com>
    Signed-off-by: Nic Crane <th...@gmail.com>
---
 r/NAMESPACE            |  1 +
 r/R/arrow-package.R    |  2 +-
 r/R/arrowExports.R     |  8 ++++++--
 r/R/dplyr-collect.R    |  6 ++++++
 r/R/dplyr-eval.R       |  3 ++-
 r/R/dplyr-select.R     |  3 ++-
 r/R/schema.R           |  6 +++---
 r/src/arrowExports.cpp | 17 +++++++++++++----
 r/src/schema.cpp       | 19 ++++++++++++++++++-
 9 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index f1f4bd8057..4a0c6ed261 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -412,6 +412,7 @@ importFrom(purrr,map_dfr)
 importFrom(purrr,map_int)
 importFrom(purrr,map_lgl)
 importFrom(purrr,reduce)
+importFrom(purrr,walk)
 importFrom(rlang,"%||%")
 importFrom(rlang,":=")
 importFrom(rlang,.data)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 4c5067480a..1ab4e41a7a 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -18,7 +18,7 @@
 #' @importFrom stats quantile median na.omit na.exclude na.pass na.fail
 #' @importFrom R6 R6Class
 #' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dbl map_dfr map_int map_lgl keep imap imap_chr
-#' @importFrom purrr flatten reduce
+#' @importFrom purrr flatten reduce walk
 #' @importFrom assertthat assert_that is.string
 #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos quo
 #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index a45bb7ae57..c42fca00b5 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -1876,8 +1876,12 @@ Scalar__ApproxEquals <- function(lhs, rhs) {
   .Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs)
 }
 
-schema_ <- function(fields) {
-  .Call(`_arrow_schema_`, fields)
+Schema__from_fields <- function(fields) {
+  .Call(`_arrow_Schema__from_fields`, fields)
+}
+
+Schema__from_list <- function(field_list) {
+  .Call(`_arrow_Schema__from_list`, field_list)
 }
 
 Schema__ToString <- function(s) {
diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R
index be58031d96..4f8ffc7c1a 100644
--- a/r/R/dplyr-collect.R
+++ b/r/R/dplyr-collect.R
@@ -115,6 +115,12 @@ implicit_schema <- function(.data) {
   # want to go one level up (where we may have called implicit_schema() before)
   .data <- ensure_group_vars(.data)
   old_schm <- .data$.data$schema
+
+  if (is.null(.data$aggregations) && is.null(.data$join) && !needs_projection(.data$selected_columns, old_schm)) {
+    # Just use the schema we have
+    return(old_schm)
+  }
+
   # Add in any augmented fields that may exist in the query but not in the
   # real data, in case we have FieldRefs to them
   old_schm[["__filename"]] <- string()
diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R
index a8fb7c4330..15618d01d9 100644
--- a/r/R/dplyr-eval.R
+++ b/r/R/dplyr-eval.R
@@ -95,8 +95,9 @@ arrow_mask <- function(.data, aggregation = FALSE) {
     }
   }
 
+  schema <- .data$.data$schema
   # Assign the schema to the expressions
-  map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+  walk(.data$selected_columns, ~ (.$schema <- schema))
 
   # Add the column references and make the mask
   out <- new_data_mask(
diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R
index 3a9d82f975..9b6d07d375 100644
--- a/r/R/dplyr-select.R
+++ b/r/R/dplyr-select.R
@@ -45,7 +45,8 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL
   .data <- as_adq(.data)
 
   # Assign the schema to the expressions
-  map(.data$selected_columns, ~ (.$schema <- .data$.data$schema))
+  schema <- .data$.data$schema
+  walk(.data$selected_columns, ~ (.$schema <- schema))
 
   # Create a mask for evaluating expressions in tidyselect helpers
   mask <- new_environment(.cache$functions, parent = caller_env())
diff --git a/r/R/schema.R b/r/R/schema.R
index c7e26652c9..93e826eff2 100644
--- a/r/R/schema.R
+++ b/r/R/schema.R
@@ -182,9 +182,9 @@ Schema$create <- function(...) {
   }
 
   if (all(map_lgl(.list, ~ inherits(., "Field")))) {
-    schema_(.list)
+    Schema__from_fields(.list)
   } else {
-    schema_(.fields(.list))
+    Schema__from_list(imap(.list, as_type))
   }
 }
 #' @include arrowExports.R
@@ -298,7 +298,7 @@ length.Schema <- function(x) x$num_fields
       call. = FALSE
     )
   }
-  schema_(fields)
+  Schema__from_fields(fields)
 }
 
 #' @export
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 91c3c6a235..cde8795c9f 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -4776,11 +4776,19 @@ BEGIN_CPP11
 END_CPP11
 }
 // schema.cpp
-std::shared_ptr<arrow::Schema> schema_(const std::vector<std::shared_ptr<arrow::Field>>& fields);
-extern "C" SEXP _arrow_schema_(SEXP fields_sexp){
+std::shared_ptr<arrow::Schema> Schema__from_fields(const std::vector<std::shared_ptr<arrow::Field>>& fields);
+extern "C" SEXP _arrow_Schema__from_fields(SEXP fields_sexp){
 BEGIN_CPP11
 	arrow::r::Input<const std::vector<std::shared_ptr<arrow::Field>>&>::type fields(fields_sexp);
-	return cpp11::as_sexp(schema_(fields));
+	return cpp11::as_sexp(Schema__from_fields(fields));
+END_CPP11
+}
+// schema.cpp
+std::shared_ptr<arrow::Schema> Schema__from_list(cpp11::list field_list);
+extern "C" SEXP _arrow_Schema__from_list(SEXP field_list_sexp){
+BEGIN_CPP11
+	arrow::r::Input<cpp11::list>::type field_list(field_list_sexp);
+	return cpp11::as_sexp(Schema__from_list(field_list));
 END_CPP11
 }
 // schema.cpp
@@ -5695,7 +5703,8 @@ static const R_CallMethodDef CallEntries[] = {
 		{ "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, 
 		{ "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, 
 		{ "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, 
-		{ "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, 
+		{ "_arrow_Schema__from_fields", (DL_FUNC) &_arrow_Schema__from_fields, 1}, 
+		{ "_arrow_Schema__from_list", (DL_FUNC) &_arrow_Schema__from_list, 1}, 
 		{ "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, 
 		{ "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, 
 		{ "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, 
diff --git a/r/src/schema.cpp b/r/src/schema.cpp
index 2bc58f0fa3..0dac188ec0 100644
--- a/r/src/schema.cpp
+++ b/r/src/schema.cpp
@@ -22,11 +22,28 @@
 #include <arrow/util/key_value_metadata.h>
 
 // [[arrow::export]]
-std::shared_ptr<arrow::Schema> schema_(
+std::shared_ptr<arrow::Schema> Schema__from_fields(
     const std::vector<std::shared_ptr<arrow::Field>>& fields) {
   return arrow::schema(fields);
 }
 
+// [[arrow::export]]
+std::shared_ptr<arrow::Schema> Schema__from_list(cpp11::list field_list) {
+  int n = field_list.size();
+
+  bool nullable = true;
+  cpp11::strings names(field_list.attr(R_NamesSymbol));
+
+  std::vector<std::shared_ptr<arrow::Field>> fields(n);
+
+  for (int i = 0; i < n; i++) {
+    fields[i] = arrow::field(
+        names[i], cpp11::as_cpp<std::shared_ptr<arrow::DataType>>(field_list[i]),
+        nullable);
+  }
+  return arrow::schema(fields);
+}
+
 // [[arrow::export]]
 std::string Schema__ToString(const std::shared_ptr<arrow::Schema>& s) {
   return s->ToString();