You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/04/07 21:07:42 UTC
[arrow] branch master updated: ARROW-16038: [R] different behavior from dplyr when mutate's `.keep` option is set
This is an automated email from the ASF dual-hosted git repository.
jonkeane pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b8299436c8 ARROW-16038: [R] different behavior from dplyr when mutate's `.keep` option is set
b8299436c8 is described below
commit b8299436c8b1a2d7cd3d6e019a2b750893a3af87
Author: SAm Albers <sa...@gmail.com>
AuthorDate: Thu Apr 7 16:07:34 2022 -0500
ARROW-16038: [R] different behavior from dplyr when mutate's `.keep` option is set
This PR does two things to match some dplyr behaviour around column order:
1) Mimics dplyr implementation of `mutate(..., .keep = "none")` to append new columns after the existing columns (if suggested) as [per](https://github.com/tidyverse/dplyr/issues/6086)
2) As per this [discussion](https://github.com/tidyverse/dplyr/issues/6086), this required a bespoke approach to `transmute` as it not simply a wrapper for `mutate(..., .keep = "none")`. This cascades into needing to catch a couple edge cases.
I have also added some tests which will test for this behaviour.
Closes #12818 from boshek/mutate-keep
Authored-by: SAm Albers <sa...@gmail.com>
Signed-off-by: Jonathan Keane <jk...@gmail.com>
---
r/NAMESPACE | 1 +
r/R/arrow-package.R | 2 +-
r/R/dplyr-mutate.R | 17 +++++++++++++++--
r/tests/testthat/test-dplyr-mutate.R | 34 +++++++++++++++++++++++++++++-----
4 files changed, 46 insertions(+), 8 deletions(-)
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 7cb89b0a53..f32e73f537 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -331,6 +331,7 @@ importFrom(rlang,":=")
importFrom(rlang,.data)
importFrom(rlang,abort)
importFrom(rlang,as_function)
+importFrom(rlang,as_label)
importFrom(rlang,as_quosure)
importFrom(rlang,call2)
importFrom(rlang,caller_env)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 2fab03d08c..256bc7aefa 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -23,7 +23,7 @@
#' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
#' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
#' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
-#' @importFrom rlang is_list call2 is_empty as_function
+#' @importFrom rlang is_list call2 is_empty as_function as_label
#' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
#' @useDynLib arrow, .registration = TRUE
#' @keywords internal
diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R
index 986f29cc1d..07802f8c83 100644
--- a/r/R/dplyr-mutate.R
+++ b/r/R/dplyr-mutate.R
@@ -94,7 +94,10 @@ mutate.arrow_dplyr_query <- function(.data,
# Respect .keep
if (.keep == "none") {
- .data$selected_columns <- .data$selected_columns[new_vars]
+ ## for consistency with dplyr, this appends new columns after existing columns
+ ## by specifying the order
+ new_cols_last <- c(intersect(old_vars, new_vars), setdiff(new_vars, old_vars))
+ .data$selected_columns <- .data$selected_columns[new_cols_last]
} else if (.keep != "all") {
# "used" or "unused"
used_vars <- unlist(lapply(exprs, all.vars), use.names = FALSE)
@@ -112,7 +115,17 @@ mutate.Dataset <- mutate.ArrowTabular <- mutate.RecordBatchReader <- mutate.arro
transmute.arrow_dplyr_query <- function(.data, ...) {
dots <- check_transmute_args(...)
- dplyr::mutate(.data, !!!dots, .keep = "none")
+ has_null <- map_lgl(dots, quo_is_null)
+ .data <- dplyr::mutate(.data, !!!dots, .keep = "none")
+ if (is_empty(dots) | any(has_null)) {
+ return(.data)
+ }
+
+ ## keeping with: https://github.com/tidyverse/dplyr/issues/6086
+ cur_exprs <- map_chr(dots, as_label)
+ transmute_order <- names(cur_exprs)
+ transmute_order[!nzchar(transmute_order)] <- cur_exprs[!nzchar(transmute_order)]
+ dplyr::select(.data, all_of(transmute_order))
}
transmute.Dataset <- transmute.ArrowTabular <- transmute.RecordBatchReader <- transmute.arrow_dplyr_query
diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R
index 61d9edac1e..a746335940 100644
--- a/r/tests/testthat/test-dplyr-mutate.R
+++ b/r/tests/testthat/test-dplyr-mutate.R
@@ -74,6 +74,16 @@ test_that("transmute", {
)
})
+test_that("transmute respect bespoke dplyr implementation", {
+ ## see: https://github.com/tidyverse/dplyr/issues/6086
+ compare_dplyr_binding(
+ .input %>%
+ transmute(dbl, int = int + 6L) %>%
+ collect(),
+ tbl
+ )
+})
+
test_that("transmute() with NULL inputs", {
compare_dplyr_binding(
.input %>%
@@ -92,6 +102,20 @@ test_that("empty transmute()", {
)
})
+test_that("transmute with unnamed expressions", {
+ compare_dplyr_binding(
+ .input %>%
+ select(int, padded_strings) %>%
+ transmute(
+ int, # bare column name
+ nchar(padded_strings) # expression
+ ) %>%
+ filter(int > 5) %>%
+ collect(),
+ tbl
+ )
+})
+
test_that("transmute() with unsupported arguments", {
expect_error(
tbl %>%
@@ -329,13 +353,13 @@ test_that("dplyr::mutate's examples", {
#> <chr> <chr> <dbl>
#> 1 a b 3
compare_dplyr_binding(
- .input %>% mutate(z = x + y, .keep = "none") %>% collect(), # same as transmute()
+ .input %>% mutate(z = x + y, x, .keep = "none") %>% collect(),
df
)
- #> # A tibble: 1 x 1
- #> z
- #> <dbl>
- #> 1 3
+ #> # A tibble: 1 × 2
+ #> x z
+ #> <dbl> <dbl>
+ #> 1 1 3
# Grouping ----------------------------------------
# The mutate operation may yield different results on grouped