Posted to commits@arrow.apache.org by np...@apache.org on 2021/02/17 21:54:26 UTC

[arrow] branch master updated: ARROW-11657: [R] group_by with .drop specified errors

This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f84b81d  ARROW-11657: [R] group_by with .drop specified errors
f84b81d is described below

commit f84b81d47f3f053033f856a25253523d06128818
Author: Neal Richardson <ne...@gmail.com>
AuthorDate: Wed Feb 17 13:53:33 2021 -0800

    ARROW-11657: [R] group_by with .drop specified errors
    
    https://github.com/tidyverse/dplyr/issues/5763 concisely exposes multiple issues:
    
    * We don't expect the .drop argument to group_by, so specifying it errors (fixed here)
    * You can apparently provide expressions to `...` in group_by, which effectively does a `mutate()` to add the columns and then groups by them (detected here with a useful error; implementing the feature itself is deferred to ARROW-11658; see the sketch after this list)
    * Our input validation in `[.ArrowTabular` and the Table/RecordBatch `SelectColumns` methods was incomplete, and where it was present, the error messages were not very helpful (fixed here)
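    
    For illustration, a minimal sketch of the second and third points; the data frame `df` and its columns are placeholders, while the error messages are the ones added in this patch:
    
        library(dplyr)
        library(arrow)
    
        df <- data.frame(chr = c("a", "b"), int = 1:2, stringsAsFactors = FALSE)
    
        # In dplyr, an expression in `...` effectively mutate()s and then groups:
        df %>% group_by(int2 = int * 2) %>% group_vars()
        #> [1] "int2"
    
        # On an Arrow object we now raise a useful error instead of misbehaving
        # (implementing the feature itself is deferred to ARROW-11658):
        try(Table$create(df) %>% group_by(int2 = int * 2))
        #> Error: Cannot create or rename columns in group_by on Arrow objects
    
        # And `[` on a Table/RecordBatch now validates the column selection:
        try(Table$create(df)[, c("chr", "NOTACOLUMN")])
        #> Error: Column not found: "NOTACOLUMN"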
    
    Other issues observed here and deferred:
    
    * Table has a proper SelectColumns method in the C++ library but the RecordBatch one is in the R library and should be pushed down to C++ (ARROW-11660)
    * The .drop argument is not actually implemented here; it is only caught if specified, and it errors if the value given is anything other than the default (see the sketch below). We should keep it around like we do the group_vars themselves (ARROW-11658), and we'll eventually need to implement it in the C++ query engine as well.
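    
    As background, a minimal sketch of what `.drop` controls in dplyr, using a throwaway factor column `fct_df` (placeholder data); the Arrow error is the one added in this patch:
    
        library(dplyr)
        library(arrow)
    
        fct_df <- data.frame(f = factor("a", levels = c("a", "b")), x = 1)
    
        # Default .drop = TRUE: groups for unused factor levels are dropped
        fct_df %>% group_by(f) %>% tally()
        #> 1 row: f = "a", n = 1
    
        # .drop = FALSE keeps a zero-row group for the unused level "b"
        fct_df %>% group_by(f, .drop = FALSE) %>% tally()
        #> 2 rows: f = "a" (n = 1) and f = "b" (n = 0)
    
        # On Arrow objects, only the default is accepted for now:
        try(Table$create(fct_df) %>% group_by(f, .drop = FALSE))
        #> Error: .drop argument not supported for Arrow objects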
    
    Closes #9510 from nealrichardson/group-by-drop
    
    Authored-by: Neal Richardson <ne...@gmail.com>
    Signed-off-by: Neal Richardson <ne...@gmail.com>
---
 r/R/arrow-tabular.R                 | 14 +++++++++++---
 r/R/dplyr.R                         | 19 ++++++++++++++++++-
 r/src/recordbatch.cpp               |  4 ++++
 r/tests/testthat/test-RecordBatch.R |  5 +++++
 r/tests/testthat/test-Table.R       |  4 ++++
 r/tests/testthat/test-dplyr.R       | 27 +++++++++++++++++++++++++++
 6 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/r/R/arrow-tabular.R b/r/R/arrow-tabular.R
index c0ac3df..a41586f 100644
--- a/r/R/arrow-tabular.R
+++ b/r/R/arrow-tabular.R
@@ -64,14 +64,22 @@ as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ..
   if (!missing(j)) {
     # Selecting columns is cheaper than filtering rows, so do it first.
     # That way, if we're filtering too, we have fewer arrays to filter/slice/take
+    if (is.character(j)) {
+      j_new <- match(j, names(x))
+      if (any(is.na(j_new))) {
+        stop("Column not found: ", oxford_paste(j[is.na(j_new)]), call. = FALSE)
+      }
+      j <- j_new
+    }
     if (is_integerish(j)) {
-      if (all(j < 0)) {
+      if (any(is.na(j))) {
+        stop("Column indices cannot be NA", call. = FALSE)
+      }
+      if (length(j) && all(j < 0)) {
         # in R, negative j means "everything but j"
         j <- setdiff(seq_len(x$num_columns), -1 * j)
       }
       x <- x$SelectColumns(as.integer(j) - 1L)
-    } else if (is.character(j)) {
-      x <- x$SelectColumns(match(j, names(x)) - 1L)
     }
 
     if (drop && ncol(x) == 1L) {
diff --git a/r/R/dplyr.R b/r/R/dplyr.R
index f9df04d..3271374 100644
--- a/r/R/dplyr.R
+++ b/r/R/dplyr.R
@@ -381,8 +381,25 @@ summarise.arrow_dplyr_query <- function(.data, ...) {
 }
 summarise.Dataset <- summarise.ArrowTabular <- summarise.arrow_dplyr_query
 
-group_by.arrow_dplyr_query <- function(.data, ..., .add = FALSE, add = .add) {
+group_by.arrow_dplyr_query <- function(.data,
+                                       ...,
+                                       .add = FALSE,
+                                       add = .add,
+                                       .drop = TRUE) {
+  if (!isTRUE(.drop)) {
+    stop(".drop argument not supported for Arrow objects", call. = FALSE)
+  }
   .data <- arrow_dplyr_query(.data)
+  # ... can contain expressions (i.e. can add (or rename?) columns)
+  # Check for those (they show up as named expressions)
+  new_groups <- enquos(...)
+  new_groups <- new_groups[nzchar(names(new_groups))]
+  if (length(new_groups)) {
+    # TODO(ARROW-11658): either find a way to let group_by_prepare handle this
+    # (it may call mutate() for us)
+    # or essentially reimplement it here (see dplyr:::add_computed_columns)
+    stop("Cannot create or rename columns in group_by on Arrow objects", call. = FALSE)
+  }
   if (".add" %in% names(formals(dplyr::group_by))) {
     # dplyr >= 1.0
     gv <- dplyr::group_by_prepare(.data, ..., .add = .add)$group_names
diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp
index 715bf8a..6eb1f0c 100644
--- a/r/src/recordbatch.cpp
+++ b/r/src/recordbatch.cpp
@@ -96,12 +96,16 @@ std::shared_ptr<arrow::RecordBatch> RecordBatch__SelectColumns(
     const std::shared_ptr<arrow::RecordBatch>& batch, cpp11::integers indices) {
   R_xlen_t n = indices.size();
   auto nrows = batch->num_rows();
+  auto ncols = batch->num_columns();
 
   std::vector<std::shared_ptr<arrow::Field>> fields(n);
   std::vector<std::shared_ptr<arrow::Array>> columns(n);
 
   for (R_xlen_t i = 0; i < n; i++) {
     int pos = indices[i];
+    if (pos < 0 || pos > ncols - 1) {
+      cpp11::stop("Invalid column index %d to select columns.", pos);
+    }
     fields[i] = batch->schema()->field(pos);
     columns[i] = batch->column(pos);
   }
diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R
index b16f631..aeee66d 100644
--- a/r/tests/testthat/test-RecordBatch.R
+++ b/r/tests/testthat/test-RecordBatch.R
@@ -147,6 +147,11 @@ test_that("[ on RecordBatch", {
   expect_data_frame(batch[batch$lgl,], tbl[tbl$lgl,])
   # int Array
   expect_data_frame(batch[Array$create(5:6), 2:4], tbl[6:7, 2:4])
+
+  # input validation
+  expect_error(batch[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"')
+  expect_error(batch[, c(6, NA)], 'Column indices cannot be NA')
+  expect_error(batch[, c(2, -2)], 'Invalid column index')
 })
 
 test_that("[[ and $ on RecordBatch", {
diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R
index 33e39c9..f68ce4e 100644
--- a/r/tests/testthat/test-Table.R
+++ b/r/tests/testthat/test-Table.R
@@ -141,6 +141,10 @@ test_that("[, [[, $ for Table", {
   expect_error(tab[1000],  "Invalid column index")
   expect_error(tab[1:1000], "Invalid column index")
 
+  # input validation
+  expect_error(tab[, c("dbl", "NOTACOLUMN")], 'Column not found: "NOTACOLUMN"')
+  expect_error(tab[, c(6, NA)], 'Column indices cannot be NA')
+
   skip("Table with 0 cols doesn't know how many rows it should have")
   expect_data_frame(tab[0], tbl[0])
 })
diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R
index 3082019..6d9945a 100644
--- a/r/tests/testthat/test-dplyr.R
+++ b/r/tests/testthat/test-dplyr.R
@@ -446,6 +446,14 @@ test_that("group_by groupings are recorded", {
   expect_identical(collect(batch), tbl)
 })
 
+test_that("group_by doesn't yet support creating/renaming", {
+  expect_error(
+    record_batch(tbl) %>%
+      group_by(chr, numbers = int),
+    "Cannot create or rename columns in group_by on Arrow objects"
+  )
+})
+
 test_that("ungroup", {
   expect_dplyr_equal(
     input %>%
@@ -534,6 +542,25 @@ test_that("group_by then rename", {
   )
 })
 
+test_that("group_by with .drop", {
+  expect_identical(
+    Table$create(tbl) %>% 
+      group_by(chr, .drop = TRUE) %>%
+      group_vars(), 
+    "chr"
+  )
+  expect_dplyr_equal(
+    input %>%
+      group_by(chr, .drop = TRUE) %>%
+      collect(),
+    tbl
+  )
+  expect_error(
+    Table$create(tbl) %>% group_by(chr, .drop = FALSE),
+    "not supported"
+  )
+})
+
 test_that("pull", {
   expect_dplyr_equal(
     input %>% pull(),