You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ro...@apache.org on 2019/06/03 07:08:16 UTC

[arrow] branch master updated: ARROW-5361: [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144

This is an automated email from the ASF dual-hosted git repository.

romainfrancois pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5025126  ARROW-5361: [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144
5025126 is described below

commit 50251264d0004aef38548468d2dc81996b48f267
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Mon Jun 3 09:07:53 2019 +0200

    ARROW-5361: [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144
    
    At the moment however, all the `DictionaryMemo` use is internal, it should probably be promoted to arguments (with defaults) to the R functions.
    
    I'll do this here or on another PR if this one is merged first so that `r/` builds again on travis.
    
    This now needs the C++ lib up to date, e.g. on my setup I get it through `brew install apache-arrow --HEAD`, and there is no conditional compiling so that it still works with previous versions. Let me know if that's ok.
    
    follow up from #4316
    
    Author: Romain Francois <ro...@rstudio.com>
    
    Closes #4413 from romainfrancois/ARROW-5361/dictionary and squashes the following commits:
    
    b0de1a8a <Romain Francois> R should pass now
    2556c16f <Romain Francois> document()
    fa0440f1 <Romain Francois> update R to changes from ARROW-3144 #4316
---
 .travis.yml                      |  2 --
 r/R/RcppExports.R                | 12 ++++++------
 r/R/dictionary.R                 | 16 ++++++++--------
 r/man/dictionary.Rd              |  8 ++++----
 r/src/RcppExports.cpp            | 28 ++++++++++++++--------------
 r/src/array_from_vector.cpp      | 35 +++++++++++------------------------
 r/src/datatype.cpp               | 16 ++++++++--------
 r/src/message.cpp                |  9 +++++++--
 r/src/recordbatch.cpp            |  4 +++-
 r/tests/testthat/test-DataType.R |  4 ++--
 10 files changed, 63 insertions(+), 71 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 79ce8cd..622e820 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -47,8 +47,6 @@ before_install:
   - if [ $TRAVIS_OS_NAME == "linux" ]; then ccache -s; fi
 
 matrix:
-  allow_failures:
-  - language: r
   fast_finish: true
   include:
   - name: "Lint, Release tests"
diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R
index 3940cc2..81e1c04 100644
--- a/r/R/RcppExports.R
+++ b/r/R/RcppExports.R
@@ -417,20 +417,20 @@ TimestampType__unit <- function(type) {
     .Call(`_arrow_TimestampType__unit`, type)
 }
 
-DictionaryType__initialize <- function(type, array, ordered) {
-    .Call(`_arrow_DictionaryType__initialize`, type, array, ordered)
+DictionaryType__initialize <- function(index_type, value_type, ordered) {
+    .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered)
 }
 
 DictionaryType__index_type <- function(type) {
     .Call(`_arrow_DictionaryType__index_type`, type)
 }
 
-DictionaryType__name <- function(type) {
-    .Call(`_arrow_DictionaryType__name`, type)
+DictionaryType__value_type <- function(type) {
+    .Call(`_arrow_DictionaryType__value_type`, type)
 }
 
-DictionaryType__dictionary <- function(type) {
-    .Call(`_arrow_DictionaryType__dictionary`, type)
+DictionaryType__name <- function(type) {
+    .Call(`_arrow_DictionaryType__name`, type)
 }
 
 DictionaryType__ordered <- function(type) {
diff --git a/r/R/dictionary.R b/r/R/dictionary.R
index 3c3758d..bfe2373 100644
--- a/r/R/dictionary.R
+++ b/r/R/dictionary.R
@@ -34,7 +34,7 @@
 
   active = list(
     index_type = function() `arrow::DataType`$dispatch(DictionaryType__index_type(self)),
-    dictionary = function() shared_ptr(`arrow::Array`, DictionaryType__dictionary(self)),
+    value_type = function() `arrow::DataType`$dispatch(DictionaryType__value_type(self)),
     name = function() DictionaryType__name(self),
     ordered = function() DictionaryType__ordered(self)
   )
@@ -42,17 +42,17 @@
 
 #' dictionary type factory
 #'
-#' @param type indices type, e.g. [int32()]
-#' @param values values array, typically an arrow array of strings
-#' @param ordered Is this an ordered dictionary
+#' @param index_type index type, e.g. [int32()]
+#' @param value_type value type, probably [utf8()]
+#' @param ordered Is this an ordered dictionary ?
 #'
 #' @return a [arrow::DictionaryType][arrow__DictionaryType]
 #'
 #' @export
-dictionary <- function(type, values, ordered = FALSE) {
+dictionary <- function(index_type, value_type, ordered = FALSE) {
   assert_that(
-    inherits(type, "arrow::DataType"),
-    inherits(values, "arrow::Array")
+    inherits(index_type, "arrow::DataType"),
+    inherits(index_type, "arrow::DataType")
   )
-  shared_ptr(`arrow::DictionaryType`, DictionaryType__initialize(type, values, ordered))
+  shared_ptr(`arrow::DictionaryType`, DictionaryType__initialize(index_type, value_type, ordered))
 }
diff --git a/r/man/dictionary.Rd b/r/man/dictionary.Rd
index 340283e..9662328 100644
--- a/r/man/dictionary.Rd
+++ b/r/man/dictionary.Rd
@@ -4,14 +4,14 @@
 \alias{dictionary}
 \title{dictionary type factory}
 \usage{
-dictionary(type, values, ordered = FALSE)
+dictionary(index_type, value_type, ordered = FALSE)
 }
 \arguments{
-\item{type}{indices type, e.g. \code{\link[=int32]{int32()}}}
+\item{index_type}{index type, e.g. \code{\link[=int32]{int32()}}}
 
-\item{values}{values array, typically an arrow array of strings}
+\item{value_type}{value type, probably \code{\link[=utf8]{utf8()}}}
 
-\item{ordered}{Is this an ordered dictionary}
+\item{ordered}{Is this an ordered dictionary ?}
 }
 \value{
 a \link[=arrow__DictionaryType]{arrow::DictionaryType}
diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp
index b39e635..1ac96d4 100644
--- a/r/src/RcppExports.cpp
+++ b/r/src/RcppExports.cpp
@@ -1172,15 +1172,15 @@ BEGIN_RCPP
 END_RCPP
 }
 // DictionaryType__initialize
-std::shared_ptr<arrow::DataType> DictionaryType__initialize(const std::shared_ptr<arrow::DataType>& type, const std::shared_ptr<arrow::Array>& array, bool ordered);
-RcppExport SEXP _arrow_DictionaryType__initialize(SEXP typeSEXP, SEXP arraySEXP, SEXP orderedSEXP) {
+std::shared_ptr<arrow::DataType> DictionaryType__initialize(const std::shared_ptr<arrow::DataType>& index_type, const std::shared_ptr<arrow::DataType>& value_type, bool ordered);
+RcppExport SEXP _arrow_DictionaryType__initialize(SEXP index_typeSEXP, SEXP value_typeSEXP, SEXP orderedSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
-    Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DataType>& >::type type(typeSEXP);
-    Rcpp::traits::input_parameter< const std::shared_ptr<arrow::Array>& >::type array(arraySEXP);
+    Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DataType>& >::type index_type(index_typeSEXP);
+    Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DataType>& >::type value_type(value_typeSEXP);
     Rcpp::traits::input_parameter< bool >::type ordered(orderedSEXP);
-    rcpp_result_gen = Rcpp::wrap(DictionaryType__initialize(type, array, ordered));
+    rcpp_result_gen = Rcpp::wrap(DictionaryType__initialize(index_type, value_type, ordered));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -1195,25 +1195,25 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
-// DictionaryType__name
-std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type);
-RcppExport SEXP _arrow_DictionaryType__name(SEXP typeSEXP) {
+// DictionaryType__value_type
+std::shared_ptr<arrow::DataType> DictionaryType__value_type(const std::shared_ptr<arrow::DictionaryType>& type);
+RcppExport SEXP _arrow_DictionaryType__value_type(SEXP typeSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DictionaryType>& >::type type(typeSEXP);
-    rcpp_result_gen = Rcpp::wrap(DictionaryType__name(type));
+    rcpp_result_gen = Rcpp::wrap(DictionaryType__value_type(type));
     return rcpp_result_gen;
 END_RCPP
 }
-// DictionaryType__dictionary
-std::shared_ptr<arrow::Array> DictionaryType__dictionary(const std::shared_ptr<arrow::DictionaryType>& type);
-RcppExport SEXP _arrow_DictionaryType__dictionary(SEXP typeSEXP) {
+// DictionaryType__name
+std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type);
+RcppExport SEXP _arrow_DictionaryType__name(SEXP typeSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DictionaryType>& >::type type(typeSEXP);
-    rcpp_result_gen = Rcpp::wrap(DictionaryType__dictionary(type));
+    rcpp_result_gen = Rcpp::wrap(DictionaryType__name(type));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -2419,8 +2419,8 @@ static const R_CallMethodDef CallEntries[] = {
     {"_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1},
     {"_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3},
     {"_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1},
+    {"_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1},
     {"_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1},
-    {"_arrow_DictionaryType__dictionary", (DL_FUNC) &_arrow_DictionaryType__dictionary, 1},
     {"_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1},
     {"_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2},
     {"_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2},
diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp
index ac72a4a..509a39a 100644
--- a/r/src/array_from_vector.cpp
+++ b/r/src/array_from_vector.cpp
@@ -141,8 +141,11 @@ std::shared_ptr<Array> MakeFactorArrayImpl(Rcpp::IntegerVector_ factor,
       ArrayData::Make(std::make_shared<Type>(), n, std::move(buffers), null_count, 0);
   auto array_indices = MakeArray(array_indices_data);
 
+  SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
+  auto dict = MakeStringArray(levels);
+
   std::shared_ptr<Array> out;
-  STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, &out));
+  STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, dict, &out));
   return out;
 }
 
@@ -741,22 +744,20 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
 }
 
 template <typename Type>
-std::shared_ptr<arrow::DataType> GetFactorTypeImpl(Rcpp::IntegerVector_ factor) {
-  auto dict_values = MakeStringArray(Rf_getAttrib(factor, R_LevelsSymbol));
-  auto dict_type =
-      dictionary(std::make_shared<Type>(), dict_values, Rf_inherits(factor, "ordered"));
-  return dict_type;
+std::shared_ptr<arrow::DataType> GetFactorTypeImpl(bool ordered) {
+  return dictionary(std::make_shared<Type>(), arrow::utf8(), ordered);
 }
 
 std::shared_ptr<arrow::DataType> GetFactorType(SEXP factor) {
   SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
+  bool is_ordered = Rf_inherits(factor, "ordered");
   int n = Rf_length(levels);
   if (n < 128) {
-    return GetFactorTypeImpl<arrow::Int8Type>(factor);
+    return GetFactorTypeImpl<arrow::Int8Type>(is_ordered);
   } else if (n < 32768) {
-    return GetFactorTypeImpl<arrow::Int16Type>(factor);
+    return GetFactorTypeImpl<arrow::Int16Type>(is_ordered);
   } else {
-    return GetFactorTypeImpl<arrow::Int32Type>(factor);
+    return GetFactorTypeImpl<arrow::Int32Type>(is_ordered);
   }
 }
 
@@ -909,21 +910,7 @@ bool CheckCompatibleFactor(SEXP obj, const std::shared_ptr<arrow::DataType>& typ
 
   arrow::DictionaryType* dict_type =
       arrow::checked_cast<arrow::DictionaryType*>(type.get());
-  auto dictionary = dict_type->dictionary();
-  if (dictionary->type() != utf8()) return false;
-
-  // then compare levels
-  auto typed_dict = checked_cast<arrow::StringArray*>(dictionary.get());
-  SEXP levels = Rf_getAttrib(obj, R_LevelsSymbol);
-
-  R_xlen_t n = XLENGTH(levels);
-  if (n != typed_dict->length()) return false;
-
-  for (R_xlen_t i = 0; i < n; i++) {
-    if (typed_dict->GetString(i) != CHAR(STRING_ELT(levels, i))) return false;
-  }
-
-  return true;
+  return dict_type->value_type() == utf8();
 }
 
 std::shared_ptr<arrow::Array> Array__from_vector(
diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp
index e392f09..b0becb4 100644
--- a/r/src/datatype.cpp
+++ b/r/src/datatype.cpp
@@ -250,9 +250,9 @@ arrow::TimeUnit::type TimestampType__unit(
 
 // [[Rcpp::export]]
 std::shared_ptr<arrow::DataType> DictionaryType__initialize(
-    const std::shared_ptr<arrow::DataType>& type,
-    const std::shared_ptr<arrow::Array>& array, bool ordered) {
-  return arrow::dictionary(type, array, ordered);
+    const std::shared_ptr<arrow::DataType>& index_type,
+    const std::shared_ptr<arrow::DataType>& value_type, bool ordered) {
+  return arrow::dictionary(index_type, value_type, ordered);
 }
 
 // [[Rcpp::export]]
@@ -262,14 +262,14 @@ std::shared_ptr<arrow::DataType> DictionaryType__index_type(
 }
 
 // [[Rcpp::export]]
-std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type) {
-  return type->name();
+std::shared_ptr<arrow::DataType> DictionaryType__value_type(
+    const std::shared_ptr<arrow::DictionaryType>& type) {
+  return type->value_type();
 }
 
 // [[Rcpp::export]]
-std::shared_ptr<arrow::Array> DictionaryType__dictionary(
-    const std::shared_ptr<arrow::DictionaryType>& type) {
-  return type->dictionary();
+std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type) {
+  return type->name();
 }
 
 // [[Rcpp::export]]
diff --git a/r/src/message.cpp b/r/src/message.cpp
index 3aec1be..cb8a3c9 100644
--- a/r/src/message.cpp
+++ b/r/src/message.cpp
@@ -56,7 +56,10 @@ std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__Message__Schema(
     const std::unique_ptr<arrow::ipc::Message>& message,
     const std::shared_ptr<arrow::Schema>& schema) {
   std::shared_ptr<arrow::RecordBatch> batch;
-  STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(*message, schema, &batch));
+
+  // TODO: perhaps this should come from the R side
+  arrow::ipc::DictionaryMemo memo;
+  STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(*message, schema, &memo, &batch));
   return batch;
 }
 
@@ -64,7 +67,9 @@ std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__Message__Schema(
 std::shared_ptr<arrow::Schema> ipc___ReadSchema_InputStream(
     const std::shared_ptr<arrow::io::InputStream>& stream) {
   std::shared_ptr<arrow::Schema> schema;
-  STOP_IF_NOT_OK(arrow::ipc::ReadSchema(stream.get(), &schema));
+  // TODO: promote to function argument
+  arrow::ipc::DictionaryMemo memo;
+  STOP_IF_NOT_OK(arrow::ipc::ReadSchema(stream.get(), &memo, &schema));
   return schema;
 }
 
diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp
index f2bec01..31fefa8 100644
--- a/r/src/recordbatch.cpp
+++ b/r/src/recordbatch.cpp
@@ -140,6 +140,8 @@ std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__InputStream__Schema(
     const std::shared_ptr<arrow::io::InputStream>& stream,
     const std::shared_ptr<arrow::Schema>& schema) {
   std::shared_ptr<arrow::RecordBatch> batch;
-  STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(schema, stream.get(), &batch));
+  // TODO: promote to function arg
+  arrow::ipc::DictionaryMemo memo;
+  STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(schema, &memo, stream.get(), &batch));
   return batch;
 }
diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R
index fc9fc89..5faf721 100644
--- a/r/tests/testthat/test-DataType.R
+++ b/r/tests/testthat/test-DataType.R
@@ -314,7 +314,7 @@ test_that("struct type works as expected", {
 })
 
 test_that("DictionaryType works as expected (ARROW-3355)", {
-  d <- dictionary(int32(), array(c("foo", "bar", "baz")))
+  d <- dictionary(int32(), utf8())
   expect_equal(d, d)
   expect_true(d == d)
   expect_false(d == int32())
@@ -322,5 +322,5 @@ test_that("DictionaryType works as expected (ARROW-3355)", {
   expect_equal(d$bit_width, 32L)
   expect_equal(d$ToString(), "dictionary<values=string, indices=int32, ordered=0>")
   expect_equal(d$index_type, int32())
-  expect_equal(d$dictionary, array(c("foo", "bar", "baz")))
+  expect_equal(d$value_type, utf8())
 })