You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by np...@apache.org on 2022/07/15 16:07:07 UTC
[arrow] branch master updated: ARROW-17085: [R] group_vars() should not return NULL (#13621)
This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 29cc263068 ARROW-17085: [R] group_vars() should not return NULL (#13621)
29cc263068 is described below
commit 29cc263068b983e690879d4d768025439a0fdd47
Author: eitsupi <50...@users.noreply.github.com>
AuthorDate: Sat Jul 16 01:06:57 2022 +0900
ARROW-17085: [R] group_vars() should not return NULL (#13621)
If an ungrouped data.frame or an `arrow_dplyr_query` is given to `dplyr::group_vars()`, `character()` returns.
But for an ungrouped Table, `NULL` is returned.
```r
mtcars |> dplyr::group_vars()
#> character(0)
mtcars |> arrow:::as_adq() |> dplyr::group_vars()
#> character(0)
mtcars |> arrow::arrow_table() |> dplyr::group_vars()
#> NULL
```
Therefore, functions that expect `group_vars` to return character, such as the following, will fail.
```r
mtcars |> arrow::arrow_table() |> dtplyr::lazy_dt()
#> Error in new_step(parent, vars = names(parent), groups = groups, locals = list(), : is.character(groups) is not TRUE
```
This PR modifies `dplyr::group_vars()` and `dplyr::groups()` for Arrow objects to work the same as for data.frame.
(Note that `arrow_dplyr_query` already works the same way as data.frame.)
Lead-authored-by: SHIMA Tatsuya <ts...@gmail.com>
Co-authored-by: eitsupi <50...@users.noreply.github.com>
Signed-off-by: Neal Richardson <ne...@gmail.com>
---
r/R/dplyr-group-by.R | 8 ++++----
r/R/dplyr.R | 2 +-
r/tests/testthat/test-RecordBatch.R | 10 ++++++++--
r/tests/testthat/test-Table.R | 8 +++++++-
r/tests/testthat/test-metadata.R | 2 +-
5 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 250dbedb18..c650799e8d 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -58,13 +58,13 @@ group_by.arrow_dplyr_query <- function(.data,
group_by.Dataset <- group_by.ArrowTabular <- group_by.RecordBatchReader <- group_by.arrow_dplyr_query
groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x))
-groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <- function(x) NULL
+groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <- groups.arrow_dplyr_query
group_vars.arrow_dplyr_query <- function(x) x$group_by_vars
-group_vars.Dataset <- function(x) NULL
-group_vars.RecordBatchReader <- function(x) NULL
+group_vars.Dataset <- function(x) character()
+group_vars.RecordBatchReader <- function(x) character()
group_vars.ArrowTabular <- function(x) {
- x$metadata$r$attributes$.group_vars
+ x$metadata$r$attributes$.group_vars %||% character()
}
# the logical literal in the two functions below controls the default value of
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index b048d98018..1296e60384 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -42,7 +42,7 @@ arrow_dplyr_query <- function(.data) {
gv <- tryCatch(
# If dplyr is not available, or if the input doesn't have a group_vars
# method, assume no group vars
- dplyr::group_vars(.data) %||% character(),
+ dplyr::group_vars(.data),
error = function(e) character()
)
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index e7602d9f74..6b79325934 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -654,7 +654,7 @@ test_that("Handling string data with embedded nuls", {
})
})
-test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation", {
+test_that("ARROW-11769/ARROW-13860/ARROW-17085 - grouping preserved in record batch creation", {
skip_if_not_available("dataset")
library(dplyr, warn.conflicts = FALSE)
@@ -670,6 +670,12 @@ test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation
record_batch(),
"RecordBatch"
)
+ expect_identical(
+ tbl %>%
+ record_batch() %>%
+ group_vars(),
+ group_vars(tbl)
+ )
expect_identical(
tbl %>%
group_by(fct, fct2) %>%
@@ -683,7 +689,7 @@ test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation
record_batch() %>%
ungroup() %>%
group_vars(),
- NULL
+ character()
)
expect_identical(
tbl %>%
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index 5edba2cd4a..bafd183108 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -592,7 +592,7 @@ test_that("cbind.Table handles record batches and tables", {
)
})
-test_that("ARROW-11769 - grouping preserved in table creation", {
+test_that("ARROW-11769/ARROW-17085 - grouping preserved in table creation", {
skip_if_not_available("dataset")
tbl <- tibble::tibble(
@@ -601,6 +601,12 @@ test_that("ARROW-11769 - grouping preserved in table creation", {
fct2 = factor(rep(c("C", "D"), each = 5)),
)
+ expect_identical(
+ tbl %>%
+ Table$create() %>%
+ dplyr::group_vars(),
+ dplyr::group_vars(tbl)
+ )
expect_identical(
tbl %>%
dplyr::group_by(fct, fct2) %>%
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R
index 69475ccd39..21b7ebe11a 100644
--- a/r/tests/testthat/test-metadata.R
+++ b/r/tests/testthat/test-metadata.R
@@ -382,7 +382,7 @@ test_that("grouped_df metadata is recorded (efficiently)", {
expect_s3_class(grouped, "grouped_df")
grouped_tab <- Table$create(grouped)
expect_r6_class(grouped_tab, "Table")
- expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a")
+ expect_equal(grouped_tab$metadata$r$attributes$.group_vars, "a")
})
test_that("grouped_df non-arrow metadata is preserved", {