You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2022/12/13 13:24:57 UTC

[arrow] branch master updated: GH-14872: [R] arrow returns wrong variable content when multiple group_by/summarise statements are used (#14905)

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9753a6720d GH-14872: [R] arrow returns wrong variable content when multiple group_by/summarise statements are used (#14905)
9753a6720d is described below

commit 9753a6720dabf3da1dd97706005adb411e5773ec
Author: Dewey Dunnington <de...@voltrondata.com>
AuthorDate: Tue Dec 13 09:24:50 2022 -0400

    GH-14872: [R] arrow returns wrong variable content when multiple group_by/summarise statements are used (#14905)
    
    Reprex using CRAN arrow:
    
    ``` r
    library(arrow, warn.conflicts = FALSE)
    library(dplyr, warn.conflicts = FALSE)
    
    mtcars |>
      arrow_table() |>
      select(mpg, cyl) |>
      group_by(mpg, cyl) |>
      group_by(cyl, value = "foo") |>
      collect()
    #> # A tibble: 32 × 4
    #> # Groups:   cyl, value [3]
    #>      mpg   cyl value `"foo"`
    #>    <dbl> <dbl> <dbl> <chr>
    #>  1  21       6     6 foo
    #>  2  21       6     6 foo
    #>  3  22.8     4     4 foo
    #>  4  21.4     6     6 foo
    #>  5  18.7     8     8 foo
    #>  6  18.1     6     6 foo
    #>  7  14.3     8     8 foo
    #>  8  24.4     4     4 foo
    #>  9  22.8     4     4 foo
    #> 10  19.2     6     6 foo
    #> # … with 22 more rows
    ```
    
    <sup>Created on 2022-12-09 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
    
    After this PR:
    
    ``` r
    library(arrow, warn.conflicts = FALSE)
    #> Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.
    library(dplyr, warn.conflicts = FALSE)
    
    mtcars |>
      arrow_table() |>
      select(mpg, cyl) |>
      group_by(mpg, cyl) |>
      group_by(cyl, value = "foo") |>
      collect()
    #> # A tibble: 32 × 3
    #> # Groups:   cyl, value [3]
    #>      mpg   cyl value
    #>    <dbl> <dbl> <chr>
    #>  1  21       6 foo
    #>  2  21       6 foo
    #>  3  22.8     4 foo
    #>  4  21.4     6 foo
    #>  5  18.7     8 foo
    #>  6  18.1     6 foo
    #>  7  14.3     8 foo
    #>  8  24.4     4 foo
    #>  9  22.8     4 foo
    #> 10  19.2     6 foo
    #> # … with 22 more rows
    ```
    
    <sup>Created on 2022-12-09 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
    * Closes: #14872
    
    Authored-by: Dewey Dunnington <de...@fishandwhistle.net>
    Signed-off-by: Dewey Dunnington <de...@voltrondata.com>
---
 r/R/dplyr-group-by.R                   | 21 +++++++++++----------
 r/tests/testthat/test-dplyr-group-by.R | 18 ++++++++++++++++++
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 6a62a058a9..6eddea5529 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -35,19 +35,20 @@ group_by.arrow_dplyr_query <- function(.data,
 
   .data <- as_adq(.data)
   expression_list <- expand_across(.data, quos(...))
-  new_groups <- ensure_named_exprs(expression_list)
+  named_expression_list <- ensure_named_exprs(expression_list)
 
-  # set up group names and check which are new
+  # Set up group names
   gbp <- dplyr::group_by_prepare(.data, !!!expression_list, .add = .add)
-  existing_groups <- dplyr::group_vars(gbp$data)
-  new_group_names <- setdiff(gbp$group_names, existing_groups)
 
-  names(new_groups) <- new_group_names
-
-  if (length(new_groups)) {
-    # Add them to the data
-    .data <- dplyr::mutate(.data, !!!new_groups)
-  }
+  # Add them all (or update them) to the .data via. In theory
+  # one could calculate which variables do or do not need to be added via a
+  # complex combination of the expression names, whether they are or are not
+  # a symbol, and/or whether they currently exist in .data. Instead, we just
+  # put them all into a mutate().
+  existing_groups <- dplyr::groups(gbp$data)
+  names(existing_groups) <- dplyr::group_vars(gbp$data)
+  final_groups <- c(unclass(named_expression_list), unclass(existing_groups))[gbp$group_names]
+  .data <- dplyr::mutate(.data, !!!final_groups)
 
   .data$group_by_vars <- gbp$group_names
   .data$drop_empty_groups <- ifelse(length(gbp$group_names), .drop, dplyr::group_by_drop_default(.data))
diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R
index 9f2869dd10..f17a013916 100644
--- a/r/tests/testthat/test-dplyr-group-by.R
+++ b/r/tests/testthat/test-dplyr-group-by.R
@@ -52,6 +52,24 @@ test_that("group_by supports creating/renaming", {
   )
 })
 
+test_that("group_by supports re-grouping by overlapping groups", {
+  compare_dplyr_binding(
+    .input %>%
+      group_by(chr, int) %>%
+      group_by(int, dbl) %>%
+      collect(),
+    tbl
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      group_by(chr, int) %>%
+      group_by(int, chr = "some new value") %>%
+      collect(),
+    tbl
+  )
+})
+
 test_that("ungroup", {
   compare_dplyr_binding(
     .input %>%