You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/04/07 21:07:42 UTC

[arrow] branch master updated: ARROW-16038: [R] different behavior from dplyr when mutate's `.keep` option is set

This is an automated email from the ASF dual-hosted git repository.

jonkeane pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b8299436c8 ARROW-16038: [R] different behavior from dplyr when mutate's `.keep` option is set
b8299436c8 is described below

commit b8299436c8b1a2d7cd3d6e019a2b750893a3af87
Author: SAm Albers <sa...@gmail.com>
AuthorDate: Thu Apr 7 16:07:34 2022 -0500

    ARROW-16038: [R] different behavior from dplyr when mutate's `.keep` option is set
    
    This PR does two things to match some dplyr behaviour around column order:
    
    1) Mimics dplyr implementation of `mutate(..., .keep = "none")` to append new columns after the existing columns (if suggested) as [per](https://github.com/tidyverse/dplyr/issues/6086)
    
    2) As per this [discussion](https://github.com/tidyverse/dplyr/issues/6086), this required a bespoke approach to `transmute` as it not simply a wrapper for `mutate(..., .keep = "none")`. This cascades into needing to catch a couple edge cases.
    
    I have also added some tests which will test for this behaviour.
    
    Closes #12818 from boshek/mutate-keep
    
    Authored-by: SAm Albers <sa...@gmail.com>
    Signed-off-by: Jonathan Keane <jk...@gmail.com>
---
 r/NAMESPACE                          |  1 +
 r/R/arrow-package.R                  |  2 +-
 r/R/dplyr-mutate.R                   | 17 +++++++++++++++--
 r/tests/testthat/test-dplyr-mutate.R | 34 +++++++++++++++++++++++++++++-----
 4 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 7cb89b0a53..f32e73f537 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -331,6 +331,7 @@ importFrom(rlang,":=")
 importFrom(rlang,.data)
 importFrom(rlang,abort)
 importFrom(rlang,as_function)
+importFrom(rlang,as_label)
 importFrom(rlang,as_quosure)
 importFrom(rlang,call2)
 importFrom(rlang,caller_env)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
index 2fab03d08c..256bc7aefa 100644
--- a/r/R/arrow-package.R
+++ b/r/R/arrow-package.R
@@ -23,7 +23,7 @@
 #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
 #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
 #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
-#' @importFrom rlang is_list call2 is_empty as_function
+#' @importFrom rlang is_list call2 is_empty as_function as_label
 #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
 #' @useDynLib arrow, .registration = TRUE
 #' @keywords internal
diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R
index 986f29cc1d..07802f8c83 100644
--- a/r/R/dplyr-mutate.R
+++ b/r/R/dplyr-mutate.R
@@ -94,7 +94,10 @@ mutate.arrow_dplyr_query <- function(.data,
 
   # Respect .keep
   if (.keep == "none") {
-    .data$selected_columns <- .data$selected_columns[new_vars]
+    ## for consistency with dplyr, this appends new columns after existing columns
+    ## by specifying the order
+    new_cols_last <- c(intersect(old_vars, new_vars), setdiff(new_vars, old_vars))
+    .data$selected_columns <- .data$selected_columns[new_cols_last]
   } else if (.keep != "all") {
     # "used" or "unused"
     used_vars <- unlist(lapply(exprs, all.vars), use.names = FALSE)
@@ -112,7 +115,17 @@ mutate.Dataset <- mutate.ArrowTabular <- mutate.RecordBatchReader <- mutate.arro
 
 transmute.arrow_dplyr_query <- function(.data, ...) {
   dots <- check_transmute_args(...)
-  dplyr::mutate(.data, !!!dots, .keep = "none")
+  has_null <- map_lgl(dots, quo_is_null)
+  .data <- dplyr::mutate(.data, !!!dots, .keep = "none")
+  if (is_empty(dots) | any(has_null)) {
+    return(.data)
+  }
+
+  ## keeping with: https://github.com/tidyverse/dplyr/issues/6086
+  cur_exprs <- map_chr(dots, as_label)
+  transmute_order <- names(cur_exprs)
+  transmute_order[!nzchar(transmute_order)] <- cur_exprs[!nzchar(transmute_order)]
+  dplyr::select(.data, all_of(transmute_order))
 }
 transmute.Dataset <- transmute.ArrowTabular <- transmute.RecordBatchReader <- transmute.arrow_dplyr_query
 
diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R
index 61d9edac1e..a746335940 100644
--- a/r/tests/testthat/test-dplyr-mutate.R
+++ b/r/tests/testthat/test-dplyr-mutate.R
@@ -74,6 +74,16 @@ test_that("transmute", {
   )
 })
 
+test_that("transmute respect bespoke dplyr implementation", {
+  ## see: https://github.com/tidyverse/dplyr/issues/6086
+  compare_dplyr_binding(
+    .input %>%
+      transmute(dbl, int = int + 6L) %>%
+      collect(),
+    tbl
+  )
+})
+
 test_that("transmute() with NULL inputs", {
   compare_dplyr_binding(
     .input %>%
@@ -92,6 +102,20 @@ test_that("empty transmute()", {
   )
 })
 
+test_that("transmute with unnamed expressions", {
+  compare_dplyr_binding(
+    .input %>%
+      select(int, padded_strings) %>%
+      transmute(
+        int, # bare column name
+        nchar(padded_strings) # expression
+      ) %>%
+      filter(int > 5) %>%
+      collect(),
+    tbl
+  )
+})
+
 test_that("transmute() with unsupported arguments", {
   expect_error(
     tbl %>%
@@ -329,13 +353,13 @@ test_that("dplyr::mutate's examples", {
   #>   <chr> <chr> <dbl>
   #> 1 a     b         3
   compare_dplyr_binding(
-    .input %>% mutate(z = x + y, .keep = "none") %>% collect(), # same as transmute()
+    .input %>% mutate(z = x + y, x, .keep = "none") %>% collect(),
     df
   )
-  #> # A tibble: 1 x 1
-  #>       z
-  #>   <dbl>
-  #> 1     3
+  #> # A tibble: 1 × 2
+  #>       x     z
+  #>   <dbl> <dbl>
+  #> 1     1     3
 
   # Grouping ----------------------------------------
   # The mutate operation may yield different results on grouped