You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ro...@apache.org on 2019/06/03 07:08:16 UTC
[arrow] branch master updated: ARROW-5361: [R] Follow
DictionaryType/DictionaryArray changes from ARROW-3144
This is an automated email from the ASF dual-hosted git repository.
romainfrancois pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5025126 ARROW-5361: [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144
5025126 is described below
commit 50251264d0004aef38548468d2dc81996b48f267
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Mon Jun 3 09:07:53 2019 +0200
ARROW-5361: [R] Follow DictionaryType/DictionaryArray changes from ARROW-3144
At the moment however, all the `DictionaryMemo` use is internal, it should probably be promoted to arguments (with defaults) to the R functions.
I'll do this here or on another PR if this one is merged first so that `r/` builds again on travis.
This now needs the C++ lib up to date, e.g. on my setup I get it through `brew install apache-arrow --HEAD`, and there is no conditional compiling so that it still works with previous versions. Let me know if that's ok.
follow up from #4316
Author: Romain Francois <ro...@rstudio.com>
Closes #4413 from romainfrancois/ARROW-5361/dictionary and squashes the following commits:
b0de1a8a <Romain Francois> R should pass now
2556c16f <Romain Francois> document()
fa0440f1 <Romain Francois> update R to changes from ARROW-3144 #4316
---
.travis.yml | 2 --
r/R/RcppExports.R | 12 ++++++------
r/R/dictionary.R | 16 ++++++++--------
r/man/dictionary.Rd | 8 ++++----
r/src/RcppExports.cpp | 28 ++++++++++++++--------------
r/src/array_from_vector.cpp | 35 +++++++++++------------------------
r/src/datatype.cpp | 16 ++++++++--------
r/src/message.cpp | 9 +++++++--
r/src/recordbatch.cpp | 4 +++-
r/tests/testthat/test-DataType.R | 4 ++--
10 files changed, 63 insertions(+), 71 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 79ce8cd..622e820 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -47,8 +47,6 @@ before_install:
- if [ $TRAVIS_OS_NAME == "linux" ]; then ccache -s; fi
matrix:
- allow_failures:
- - language: r
fast_finish: true
include:
- name: "Lint, Release tests"
diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R
index 3940cc2..81e1c04 100644
--- a/r/R/RcppExports.R
+++ b/r/R/RcppExports.R
@@ -417,20 +417,20 @@ TimestampType__unit <- function(type) {
.Call(`_arrow_TimestampType__unit`, type)
}
-DictionaryType__initialize <- function(type, array, ordered) {
- .Call(`_arrow_DictionaryType__initialize`, type, array, ordered)
+DictionaryType__initialize <- function(index_type, value_type, ordered) {
+ .Call(`_arrow_DictionaryType__initialize`, index_type, value_type, ordered)
}
DictionaryType__index_type <- function(type) {
.Call(`_arrow_DictionaryType__index_type`, type)
}
-DictionaryType__name <- function(type) {
- .Call(`_arrow_DictionaryType__name`, type)
+DictionaryType__value_type <- function(type) {
+ .Call(`_arrow_DictionaryType__value_type`, type)
}
-DictionaryType__dictionary <- function(type) {
- .Call(`_arrow_DictionaryType__dictionary`, type)
+DictionaryType__name <- function(type) {
+ .Call(`_arrow_DictionaryType__name`, type)
}
DictionaryType__ordered <- function(type) {
diff --git a/r/R/dictionary.R b/r/R/dictionary.R
index 3c3758d..bfe2373 100644
--- a/r/R/dictionary.R
+++ b/r/R/dictionary.R
@@ -34,7 +34,7 @@
active = list(
index_type = function() `arrow::DataType`$dispatch(DictionaryType__index_type(self)),
- dictionary = function() shared_ptr(`arrow::Array`, DictionaryType__dictionary(self)),
+ value_type = function() `arrow::DataType`$dispatch(DictionaryType__value_type(self)),
name = function() DictionaryType__name(self),
ordered = function() DictionaryType__ordered(self)
)
@@ -42,17 +42,17 @@
#' dictionary type factory
#'
-#' @param type indices type, e.g. [int32()]
-#' @param values values array, typically an arrow array of strings
-#' @param ordered Is this an ordered dictionary
+#' @param index_type index type, e.g. [int32()]
+#' @param value_type value type, probably [utf8()]
+#' @param ordered Is this an ordered dictionary ?
#'
#' @return a [arrow::DictionaryType][arrow__DictionaryType]
#'
#' @export
-dictionary <- function(type, values, ordered = FALSE) {
+dictionary <- function(index_type, value_type, ordered = FALSE) {
assert_that(
- inherits(type, "arrow::DataType"),
- inherits(values, "arrow::Array")
+ inherits(index_type, "arrow::DataType"),
+ inherits(index_type, "arrow::DataType")
)
- shared_ptr(`arrow::DictionaryType`, DictionaryType__initialize(type, values, ordered))
+ shared_ptr(`arrow::DictionaryType`, DictionaryType__initialize(index_type, value_type, ordered))
}
diff --git a/r/man/dictionary.Rd b/r/man/dictionary.Rd
index 340283e..9662328 100644
--- a/r/man/dictionary.Rd
+++ b/r/man/dictionary.Rd
@@ -4,14 +4,14 @@
\alias{dictionary}
\title{dictionary type factory}
\usage{
-dictionary(type, values, ordered = FALSE)
+dictionary(index_type, value_type, ordered = FALSE)
}
\arguments{
-\item{type}{indices type, e.g. \code{\link[=int32]{int32()}}}
+\item{index_type}{index type, e.g. \code{\link[=int32]{int32()}}}
-\item{values}{values array, typically an arrow array of strings}
+\item{value_type}{value type, probably \code{\link[=utf8]{utf8()}}}
-\item{ordered}{Is this an ordered dictionary}
+\item{ordered}{Is this an ordered dictionary ?}
}
\value{
a \link[=arrow__DictionaryType]{arrow::DictionaryType}
diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp
index b39e635..1ac96d4 100644
--- a/r/src/RcppExports.cpp
+++ b/r/src/RcppExports.cpp
@@ -1172,15 +1172,15 @@ BEGIN_RCPP
END_RCPP
}
// DictionaryType__initialize
-std::shared_ptr<arrow::DataType> DictionaryType__initialize(const std::shared_ptr<arrow::DataType>& type, const std::shared_ptr<arrow::Array>& array, bool ordered);
-RcppExport SEXP _arrow_DictionaryType__initialize(SEXP typeSEXP, SEXP arraySEXP, SEXP orderedSEXP) {
+std::shared_ptr<arrow::DataType> DictionaryType__initialize(const std::shared_ptr<arrow::DataType>& index_type, const std::shared_ptr<arrow::DataType>& value_type, bool ordered);
+RcppExport SEXP _arrow_DictionaryType__initialize(SEXP index_typeSEXP, SEXP value_typeSEXP, SEXP orderedSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
- Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DataType>& >::type type(typeSEXP);
- Rcpp::traits::input_parameter< const std::shared_ptr<arrow::Array>& >::type array(arraySEXP);
+ Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DataType>& >::type index_type(index_typeSEXP);
+ Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DataType>& >::type value_type(value_typeSEXP);
Rcpp::traits::input_parameter< bool >::type ordered(orderedSEXP);
- rcpp_result_gen = Rcpp::wrap(DictionaryType__initialize(type, array, ordered));
+ rcpp_result_gen = Rcpp::wrap(DictionaryType__initialize(index_type, value_type, ordered));
return rcpp_result_gen;
END_RCPP
}
@@ -1195,25 +1195,25 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
-// DictionaryType__name
-std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type);
-RcppExport SEXP _arrow_DictionaryType__name(SEXP typeSEXP) {
+// DictionaryType__value_type
+std::shared_ptr<arrow::DataType> DictionaryType__value_type(const std::shared_ptr<arrow::DictionaryType>& type);
+RcppExport SEXP _arrow_DictionaryType__value_type(SEXP typeSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DictionaryType>& >::type type(typeSEXP);
- rcpp_result_gen = Rcpp::wrap(DictionaryType__name(type));
+ rcpp_result_gen = Rcpp::wrap(DictionaryType__value_type(type));
return rcpp_result_gen;
END_RCPP
}
-// DictionaryType__dictionary
-std::shared_ptr<arrow::Array> DictionaryType__dictionary(const std::shared_ptr<arrow::DictionaryType>& type);
-RcppExport SEXP _arrow_DictionaryType__dictionary(SEXP typeSEXP) {
+// DictionaryType__name
+std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type);
+RcppExport SEXP _arrow_DictionaryType__name(SEXP typeSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::shared_ptr<arrow::DictionaryType>& >::type type(typeSEXP);
- rcpp_result_gen = Rcpp::wrap(DictionaryType__dictionary(type));
+ rcpp_result_gen = Rcpp::wrap(DictionaryType__name(type));
return rcpp_result_gen;
END_RCPP
}
@@ -2419,8 +2419,8 @@ static const R_CallMethodDef CallEntries[] = {
{"_arrow_TimestampType__unit", (DL_FUNC) &_arrow_TimestampType__unit, 1},
{"_arrow_DictionaryType__initialize", (DL_FUNC) &_arrow_DictionaryType__initialize, 3},
{"_arrow_DictionaryType__index_type", (DL_FUNC) &_arrow_DictionaryType__index_type, 1},
+ {"_arrow_DictionaryType__value_type", (DL_FUNC) &_arrow_DictionaryType__value_type, 1},
{"_arrow_DictionaryType__name", (DL_FUNC) &_arrow_DictionaryType__name, 1},
- {"_arrow_DictionaryType__dictionary", (DL_FUNC) &_arrow_DictionaryType__dictionary, 1},
{"_arrow_DictionaryType__ordered", (DL_FUNC) &_arrow_DictionaryType__ordered, 1},
{"_arrow_ipc___feather___TableWriter__SetDescription", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetDescription, 2},
{"_arrow_ipc___feather___TableWriter__SetNumRows", (DL_FUNC) &_arrow_ipc___feather___TableWriter__SetNumRows, 2},
diff --git a/r/src/array_from_vector.cpp b/r/src/array_from_vector.cpp
index ac72a4a..509a39a 100644
--- a/r/src/array_from_vector.cpp
+++ b/r/src/array_from_vector.cpp
@@ -141,8 +141,11 @@ std::shared_ptr<Array> MakeFactorArrayImpl(Rcpp::IntegerVector_ factor,
ArrayData::Make(std::make_shared<Type>(), n, std::move(buffers), null_count, 0);
auto array_indices = MakeArray(array_indices_data);
+ SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
+ auto dict = MakeStringArray(levels);
+
std::shared_ptr<Array> out;
- STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, &out));
+ STOP_IF_NOT_OK(DictionaryArray::FromArrays(type, array_indices, dict, &out));
return out;
}
@@ -741,22 +744,20 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
}
template <typename Type>
-std::shared_ptr<arrow::DataType> GetFactorTypeImpl(Rcpp::IntegerVector_ factor) {
- auto dict_values = MakeStringArray(Rf_getAttrib(factor, R_LevelsSymbol));
- auto dict_type =
- dictionary(std::make_shared<Type>(), dict_values, Rf_inherits(factor, "ordered"));
- return dict_type;
+std::shared_ptr<arrow::DataType> GetFactorTypeImpl(bool ordered) {
+ return dictionary(std::make_shared<Type>(), arrow::utf8(), ordered);
}
std::shared_ptr<arrow::DataType> GetFactorType(SEXP factor) {
SEXP levels = Rf_getAttrib(factor, R_LevelsSymbol);
+ bool is_ordered = Rf_inherits(factor, "ordered");
int n = Rf_length(levels);
if (n < 128) {
- return GetFactorTypeImpl<arrow::Int8Type>(factor);
+ return GetFactorTypeImpl<arrow::Int8Type>(is_ordered);
} else if (n < 32768) {
- return GetFactorTypeImpl<arrow::Int16Type>(factor);
+ return GetFactorTypeImpl<arrow::Int16Type>(is_ordered);
} else {
- return GetFactorTypeImpl<arrow::Int32Type>(factor);
+ return GetFactorTypeImpl<arrow::Int32Type>(is_ordered);
}
}
@@ -909,21 +910,7 @@ bool CheckCompatibleFactor(SEXP obj, const std::shared_ptr<arrow::DataType>& typ
arrow::DictionaryType* dict_type =
arrow::checked_cast<arrow::DictionaryType*>(type.get());
- auto dictionary = dict_type->dictionary();
- if (dictionary->type() != utf8()) return false;
-
- // then compare levels
- auto typed_dict = checked_cast<arrow::StringArray*>(dictionary.get());
- SEXP levels = Rf_getAttrib(obj, R_LevelsSymbol);
-
- R_xlen_t n = XLENGTH(levels);
- if (n != typed_dict->length()) return false;
-
- for (R_xlen_t i = 0; i < n; i++) {
- if (typed_dict->GetString(i) != CHAR(STRING_ELT(levels, i))) return false;
- }
-
- return true;
+ return dict_type->value_type() == utf8();
}
std::shared_ptr<arrow::Array> Array__from_vector(
diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp
index e392f09..b0becb4 100644
--- a/r/src/datatype.cpp
+++ b/r/src/datatype.cpp
@@ -250,9 +250,9 @@ arrow::TimeUnit::type TimestampType__unit(
// [[Rcpp::export]]
std::shared_ptr<arrow::DataType> DictionaryType__initialize(
- const std::shared_ptr<arrow::DataType>& type,
- const std::shared_ptr<arrow::Array>& array, bool ordered) {
- return arrow::dictionary(type, array, ordered);
+ const std::shared_ptr<arrow::DataType>& index_type,
+ const std::shared_ptr<arrow::DataType>& value_type, bool ordered) {
+ return arrow::dictionary(index_type, value_type, ordered);
}
// [[Rcpp::export]]
@@ -262,14 +262,14 @@ std::shared_ptr<arrow::DataType> DictionaryType__index_type(
}
// [[Rcpp::export]]
-std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type) {
- return type->name();
+std::shared_ptr<arrow::DataType> DictionaryType__value_type(
+ const std::shared_ptr<arrow::DictionaryType>& type) {
+ return type->value_type();
}
// [[Rcpp::export]]
-std::shared_ptr<arrow::Array> DictionaryType__dictionary(
- const std::shared_ptr<arrow::DictionaryType>& type) {
- return type->dictionary();
+std::string DictionaryType__name(const std::shared_ptr<arrow::DictionaryType>& type) {
+ return type->name();
}
// [[Rcpp::export]]
diff --git a/r/src/message.cpp b/r/src/message.cpp
index 3aec1be..cb8a3c9 100644
--- a/r/src/message.cpp
+++ b/r/src/message.cpp
@@ -56,7 +56,10 @@ std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__Message__Schema(
const std::unique_ptr<arrow::ipc::Message>& message,
const std::shared_ptr<arrow::Schema>& schema) {
std::shared_ptr<arrow::RecordBatch> batch;
- STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(*message, schema, &batch));
+
+ // TODO: perhaps this should come from the R side
+ arrow::ipc::DictionaryMemo memo;
+ STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(*message, schema, &memo, &batch));
return batch;
}
@@ -64,7 +67,9 @@ std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__Message__Schema(
std::shared_ptr<arrow::Schema> ipc___ReadSchema_InputStream(
const std::shared_ptr<arrow::io::InputStream>& stream) {
std::shared_ptr<arrow::Schema> schema;
- STOP_IF_NOT_OK(arrow::ipc::ReadSchema(stream.get(), &schema));
+ // TODO: promote to function argument
+ arrow::ipc::DictionaryMemo memo;
+ STOP_IF_NOT_OK(arrow::ipc::ReadSchema(stream.get(), &memo, &schema));
return schema;
}
diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp
index f2bec01..31fefa8 100644
--- a/r/src/recordbatch.cpp
+++ b/r/src/recordbatch.cpp
@@ -140,6 +140,8 @@ std::shared_ptr<arrow::RecordBatch> ipc___ReadRecordBatch__InputStream__Schema(
const std::shared_ptr<arrow::io::InputStream>& stream,
const std::shared_ptr<arrow::Schema>& schema) {
std::shared_ptr<arrow::RecordBatch> batch;
- STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(schema, stream.get(), &batch));
+ // TODO: promote to function arg
+ arrow::ipc::DictionaryMemo memo;
+ STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(schema, &memo, stream.get(), &batch));
return batch;
}
diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R
index fc9fc89..5faf721 100644
--- a/r/tests/testthat/test-DataType.R
+++ b/r/tests/testthat/test-DataType.R
@@ -314,7 +314,7 @@ test_that("struct type works as expected", {
})
test_that("DictionaryType works as expected (ARROW-3355)", {
- d <- dictionary(int32(), array(c("foo", "bar", "baz")))
+ d <- dictionary(int32(), utf8())
expect_equal(d, d)
expect_true(d == d)
expect_false(d == int32())
@@ -322,5 +322,5 @@ test_that("DictionaryType works as expected (ARROW-3355)", {
expect_equal(d$bit_width, 32L)
expect_equal(d$ToString(), "dictionary<values=string, indices=int32, ordered=0>")
expect_equal(d$index_type, int32())
- expect_equal(d$dictionary, array(c("foo", "bar", "baz")))
+ expect_equal(d$value_type, utf8())
})