You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by np...@apache.org on 2022/07/15 16:07:07 UTC

[arrow] branch master updated: ARROW-17085: [R] group_vars() should not return NULL (#13621)

This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 29cc263068 ARROW-17085: [R] group_vars() should not return NULL (#13621)
29cc263068 is described below

commit 29cc263068b983e690879d4d768025439a0fdd47
Author: eitsupi <50...@users.noreply.github.com>
AuthorDate: Sat Jul 16 01:06:57 2022 +0900

    ARROW-17085: [R] group_vars() should not return NULL (#13621)
    
    If an ungrouped data.frame or an `arrow_dplyr_query` is given to `dplyr::group_vars()`, `character()` returns.
    But for an ungrouped Table, `NULL` is returned.
    
    ```r
    mtcars |> dplyr::group_vars()
    #> character(0)
    mtcars |> arrow:::as_adq() |> dplyr::group_vars()
    #> character(0)
    mtcars |> arrow::arrow_table() |> dplyr::group_vars()
    #> NULL
    ```
    
    Therefore, functions that expect `group_vars` to return character, such as the following, will fail.
    
    ```r
    mtcars |> arrow::arrow_table() |> dtplyr::lazy_dt()
    #> Error in new_step(parent, vars = names(parent), groups = groups, locals = list(), : is.character(groups) is not TRUE
    ```
    
    This PR modifies `dplyr::group_vars()` and `dplyr::groups()` for Arrow objects to work the same as for data.frame.
    (Note that `arrow_dplyr_query` already works the same way as data.frame.)
    
    Lead-authored-by: SHIMA Tatsuya <ts...@gmail.com>
    Co-authored-by: eitsupi <50...@users.noreply.github.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
---
 r/R/dplyr-group-by.R                |  8 ++++----
 r/R/dplyr.R                         |  2 +-
 r/tests/testthat/test-RecordBatch.R | 10 ++++++++--
 r/tests/testthat/test-Table.R       |  8 +++++++-
 r/tests/testthat/test-metadata.R    |  2 +-
 5 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R
index 250dbedb18..c650799e8d 100644
--- a/r/R/dplyr-group-by.R
+++ b/r/R/dplyr-group-by.R
@@ -58,13 +58,13 @@ group_by.arrow_dplyr_query <- function(.data,
 group_by.Dataset <- group_by.ArrowTabular <- group_by.RecordBatchReader <- group_by.arrow_dplyr_query
 
 groups.arrow_dplyr_query <- function(x) syms(dplyr::group_vars(x))
-groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <- function(x) NULL
+groups.Dataset <- groups.ArrowTabular <- groups.RecordBatchReader <- groups.arrow_dplyr_query
 
 group_vars.arrow_dplyr_query <- function(x) x$group_by_vars
-group_vars.Dataset <- function(x) NULL
-group_vars.RecordBatchReader <- function(x) NULL
+group_vars.Dataset <- function(x) character()
+group_vars.RecordBatchReader <- function(x) character()
 group_vars.ArrowTabular <- function(x) {
-  x$metadata$r$attributes$.group_vars
+  x$metadata$r$attributes$.group_vars %||% character()
 }
 
 # the logical literal in the two functions below controls the default value of
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index b048d98018..1296e60384 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -42,7 +42,7 @@ arrow_dplyr_query <- function(.data) {
   gv <- tryCatch(
     # If dplyr is not available, or if the input doesn't have a group_vars
     # method, assume no group vars
-    dplyr::group_vars(.data) %||% character(),
+    dplyr::group_vars(.data),
     error = function(e) character()
   )
 
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index e7602d9f74..6b79325934 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -654,7 +654,7 @@ test_that("Handling string data with embedded nuls", {
   })
 })
 
-test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation", {
+test_that("ARROW-11769/ARROW-13860/ARROW-17085 - grouping preserved in record batch creation", {
   skip_if_not_available("dataset")
   library(dplyr, warn.conflicts = FALSE)
 
@@ -670,6 +670,12 @@ test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation
       record_batch(),
     "RecordBatch"
   )
+  expect_identical(
+    tbl %>%
+      record_batch() %>%
+      group_vars(),
+    group_vars(tbl)
+  )
   expect_identical(
     tbl %>%
       group_by(fct, fct2) %>%
@@ -683,7 +689,7 @@ test_that("ARROW-11769/ARROW-13860 - grouping preserved in record batch creation
       record_batch() %>%
       ungroup() %>%
       group_vars(),
-    NULL
+    character()
   )
   expect_identical(
     tbl %>%
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index 5edba2cd4a..bafd183108 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -592,7 +592,7 @@ test_that("cbind.Table handles record batches and tables", {
   )
 })
 
-test_that("ARROW-11769 - grouping preserved in table creation", {
+test_that("ARROW-11769/ARROW-17085 - grouping preserved in table creation", {
   skip_if_not_available("dataset")
 
   tbl <- tibble::tibble(
@@ -601,6 +601,12 @@ test_that("ARROW-11769 - grouping preserved in table creation", {
     fct2 = factor(rep(c("C", "D"), each = 5)),
   )
 
+  expect_identical(
+    tbl %>%
+      Table$create() %>%
+      dplyr::group_vars(),
+    dplyr::group_vars(tbl)
+  )
   expect_identical(
     tbl %>%
       dplyr::group_by(fct, fct2) %>%
diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R
index 69475ccd39..21b7ebe11a 100644
--- a/r/tests/testthat/test-metadata.R
+++ b/r/tests/testthat/test-metadata.R
@@ -382,7 +382,7 @@ test_that("grouped_df metadata is recorded (efficiently)", {
   expect_s3_class(grouped, "grouped_df")
   grouped_tab <- Table$create(grouped)
   expect_r6_class(grouped_tab, "Table")
-  expect_equal(grouped_tab$r_metadata$attributes$.group_vars, "a")
+  expect_equal(grouped_tab$metadata$r$attributes$.group_vars, "a")
 })
 
 test_that("grouped_df non-arrow metadata is preserved", {