You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pa...@apache.org on 2022/12/13 13:24:57 UTC
[arrow] branch master updated: GH-14872: [R] arrow returns wrong variable content when multiple group_by/summarise statements are used (#14905)
This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9753a6720d GH-14872: [R] arrow returns wrong variable content when multiple group_by/summarise statements are used (#14905)
9753a6720d is described below
commit 9753a6720dabf3da1dd97706005adb411e5773ec
Author: Dewey Dunnington <de...@voltrondata.com>
AuthorDate: Tue Dec 13 09:24:50 2022 -0400
GH-14872: [R] arrow returns wrong variable content when multiple group_by/summarise statements are used (#14905)
Reprex using CRAN arrow:
``` r
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
mtcars |>
arrow_table() |>
select(mpg, cyl) |>
group_by(mpg, cyl) |>
group_by(cyl, value = "foo") |>
collect()
#> # A tibble: 32 × 4
#> # Groups: cyl, value [3]
#> mpg cyl value `"foo"`
#> <dbl> <dbl> <dbl> <chr>
#> 1 21 6 6 foo
#> 2 21 6 6 foo
#> 3 22.8 4 4 foo
#> 4 21.4 6 6 foo
#> 5 18.7 8 8 foo
#> 6 18.1 6 6 foo
#> 7 14.3 8 8 foo
#> 8 24.4 4 4 foo
#> 9 22.8 4 4 foo
#> 10 19.2 6 6 foo
#> # … with 22 more rows
```
<sup>Created on 2022-12-09 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
After this PR:
``` r
library(arrow, warn.conflicts = FALSE)
#> Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.
library(dplyr, warn.conflicts = FALSE)
mtcars |>
arrow_table() |>
select(mpg, cyl) |>
group_by(mpg, cyl) |>
group_by(cyl, value = "foo") |>
collect()
#> # A tibble: 32 × 3
#> # Groups: cyl, value [3]
#> mpg cyl value
#> <dbl> <dbl> <chr>
#> 1 21 6 foo
#> 2 21 6 foo
#> 3 22.8 4 foo
#> 4 21.4 6 foo
#> 5 18.7 8 foo
#> 6 18.1 6 foo
#> 7 14.3 8 foo
#> 8 24.4 4 foo
#> 9 22.8 4 foo
#> 10 19.2 6 foo
#> # … with 22 more rows
```
<sup>Created on 2022-12-09 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
* Closes: #14872
Authored-by: Dewey Dunnington <de...@fishandwhistle.net>
Signed-off-by: Dewey Dunnington <de...@voltrondata.com>
---
r/R/dplyr-group-by.R | 21 +++++++++++----------
r/tests/testthat/test-dplyr-group-by.R | 18 ++++++++++++++++++
2 files changed, 29 insertions(+), 10 deletions(-)
diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 6a62a058a9..6eddea5529 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -35,19 +35,20 @@ group_by.arrow_dplyr_query <- function(.data,
.data <- as_adq(.data)
expression_list <- expand_across(.data, quos(...))
- new_groups <- ensure_named_exprs(expression_list)
+ named_expression_list <- ensure_named_exprs(expression_list)
- # set up group names and check which are new
+ # Set up group names
gbp <- dplyr::group_by_prepare(.data, !!!expression_list, .add = .add)
- existing_groups <- dplyr::group_vars(gbp$data)
- new_group_names <- setdiff(gbp$group_names, existing_groups)
- names(new_groups) <- new_group_names
-
- if (length(new_groups)) {
- # Add them to the data
- .data <- dplyr::mutate(.data, !!!new_groups)
- }
+ # Add them all (or update them) to the .data via. In theory
+ # one could calculate which variables do or do not need to be added via a
+ # complex combination of the expression names, whether they are or are not
+ # a symbol, and/or whether they currently exist in .data. Instead, we just
+ # put them all into a mutate().
+ existing_groups <- dplyr::groups(gbp$data)
+ names(existing_groups) <- dplyr::group_vars(gbp$data)
+ final_groups <- c(unclass(named_expression_list), unclass(existing_groups))[gbp$group_names]
+ .data <- dplyr::mutate(.data, !!!final_groups)
.data$group_by_vars <- gbp$group_names
.data$drop_empty_groups <- ifelse(length(gbp$group_names), .drop, dplyr::group_by_drop_default(.data))
diff --git a/r/tests/testthat/test-dplyr-group-by.R b/r/tests/testthat/test-dplyr-group-by.R
index 9f2869dd10..f17a013916 100644
--- a/r/tests/testthat/test-dplyr-group-by.R
+++ b/r/tests/testthat/test-dplyr-group-by.R
@@ -52,6 +52,24 @@ test_that("group_by supports creating/renaming", {
)
})
+test_that("group_by supports re-grouping by overlapping groups", {
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr, int) %>%
+ group_by(int, dbl) %>%
+ collect(),
+ tbl
+ )
+
+ compare_dplyr_binding(
+ .input %>%
+ group_by(chr, int) %>%
+ group_by(int, chr = "some new value") %>%
+ collect(),
+ tbl
+ )
+})
+
test_that("ungroup", {
compare_dplyr_binding(
.input %>%