You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by fs...@apache.org on 2019/07/12 14:40:24 UTC
[arrow] branch master updated: ARROW-5588: [C++] Better support for
building union arrays
This is an automated email from the ASF dual-hosted git repository.
fsaintjacques pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1bcfbe1 ARROW-5588: [C++] Better support for building union arrays
1bcfbe1 is described below
commit 1bcfbe107067af4a6de486623683fe744614f500
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Jul 12 10:40:03 2019 -0400
ARROW-5588: [C++] Better support for building union arrays
- Simplify DenseUnionBuilder
- Add SparesUnionBuilder
- MakeBuilder can now produce a {Sparse,Dense}UnionBuilder
- ArrayFromJSON can now produce union arrays
Author: Benjamin Kietzman <be...@gmail.com>
Closes #4781 from bkietz/5588-Better-support-for-building-UnionArrays and squashes the following commits:
38e2828a0 <Benjamin Kietzman> iwyu #include <limits>
0efe91d77 <Benjamin Kietzman> address review comments
17e6e27aa <Benjamin Kietzman> construct offset_builder_ with a MemoryPool
4131fe3fb <Benjamin Kietzman> separate child builder array indexable by type_id
fd64c1ba7 <Benjamin Kietzman> rewrite union builder to share a base class, let children_ be indexed by type_id
37de5f2d5 <Benjamin Kietzman> explicit uint8_t for msvc
673916e50 <Benjamin Kietzman> Disable ListOfDictionary test until ListBuilder is updated
cf1c5bec5 <Benjamin Kietzman> revert changes to reader.cc
5742db998 <Benjamin Kietzman> debugging: highlight the broken case and a similar one
5b1ec9361 <Benjamin Kietzman> improve doccomments, dedupe test code
33fade16e <Benjamin Kietzman> Adding support for DenseUnions to ArrayFromJSON
6245c82f1 <Benjamin Kietzman> add SparseUnionBuilder and MakeBuilder case
8d4f36dcf <Benjamin Kietzman> add tests for building lists where the value builder has mutable type
351905dd9 <Benjamin Kietzman> add test for lazily typed union builder
7902d12c8 <Benjamin Kietzman> first pass at updating DenseUnionBuilder
d20055ad9 <Benjamin Kietzman> minor refactors, adding some asserts
---
cpp/src/arrow/array-dict-test.cc | 45 ++++++
cpp/src/arrow/array-union-test.cc | 215 +++++++++++++++++++++++++++++
cpp/src/arrow/array.cc | 1 +
cpp/src/arrow/array.h | 7 +-
cpp/src/arrow/array/builder_nested.cc | 4 +-
cpp/src/arrow/array/builder_nested.h | 2 +-
cpp/src/arrow/array/builder_union.cc | 95 ++++++++++---
cpp/src/arrow/array/builder_union.h | 162 +++++++++++++++-------
cpp/src/arrow/builder.cc | 24 +++-
cpp/src/arrow/builder.h | 1 +
cpp/src/arrow/compare.cc | 44 ++----
cpp/src/arrow/ipc/json-simple-test.cc | 190 +++++++++++++++++++++++++
cpp/src/arrow/ipc/json-simple.cc | 89 ++++++++++++
cpp/src/arrow/python/serialize.cc | 10 +-
cpp/src/arrow/type.cc | 9 ++
cpp/src/arrow/type.h | 2 +
python/pyarrow/tests/test_serialization.py | 8 ++
17 files changed, 788 insertions(+), 120 deletions(-)
diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index 16d8aac..ed37df3 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -948,4 +948,49 @@ TEST(TestDictionary, TransposeNulls) {
AssertArraysEqual(*expected, *out);
}
+TEST(TestDictionary, DISABLED_ListOfDictionary) {
+ std::unique_ptr<ArrayBuilder> root_builder;
+ ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())),
+ &root_builder));
+ auto list_builder = checked_cast<ListBuilder*>(root_builder.get());
+ auto dict_builder =
+ checked_cast<DictionaryBuilder<StringType>*>(list_builder->value_builder());
+
+ ASSERT_OK(list_builder->Append());
+ std::vector<std::string> expected;
+ for (char a : "abc") {
+ for (char d : "def") {
+ for (char g : "ghi") {
+ for (char j : "jkl") {
+ for (char m : "mno") {
+ for (char p : "pqr") {
+ if ((static_cast<int>(a) + d + g + j + m + p) % 16 == 0) {
+ ASSERT_OK(list_builder->Append());
+ }
+ // 3**6 distinct strings; too large for int8
+ char str[6] = {a, d, g, j, m, p};
+ ASSERT_OK(dict_builder->Append(str));
+ expected.push_back(str);
+ }
+ }
+ }
+ }
+ }
+ }
+ std::shared_ptr<Array> expected_dict;
+ ArrayFromVector<StringType, std::string>(expected, &expected_dict);
+
+ std::shared_ptr<Array> array;
+ ASSERT_OK(root_builder->Finish(&array));
+ ASSERT_OK(ValidateArray(*array));
+
+ auto expected_type = list(dictionary(int16(), utf8()));
+ ASSERT_EQ(array->type()->ToString(), expected_type->ToString());
+
+ auto list_array = checked_cast<const ListArray*>(array.get());
+ auto actual_dict =
+ checked_cast<const DictionaryArray&>(*list_array->values()).dictionary();
+ ASSERT_ARRAYS_EQUAL(*expected_dict, *actual_dict);
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc
index 86cbeae..62a4e15 100644
--- a/cpp/src/arrow/array-union-test.cc
+++ b/cpp/src/arrow/array-union-test.cc
@@ -188,4 +188,219 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) {
type_codes);
}
+template <typename B>
+class UnionBuilderTest : public ::testing::Test {
+ public:
+ uint8_t I8 = 8, STR = 13, DBL = 7;
+
+ virtual void AppendInt(int8_t i) {
+ expected_types_vector.push_back(I8);
+ ASSERT_OK(union_builder->Append(I8));
+ ASSERT_OK(i8_builder->Append(i));
+ }
+
+ virtual void AppendString(const std::string& str) {
+ expected_types_vector.push_back(STR);
+ ASSERT_OK(union_builder->Append(STR));
+ ASSERT_OK(str_builder->Append(str));
+ }
+
+ virtual void AppendDouble(double dbl) {
+ expected_types_vector.push_back(DBL);
+ ASSERT_OK(union_builder->Append(DBL));
+ ASSERT_OK(dbl_builder->Append(dbl));
+ }
+
+ void AppendBasics() {
+ AppendInt(33);
+ AppendString("abc");
+ AppendDouble(1.0);
+ AppendDouble(-1.0);
+ AppendString("");
+ AppendInt(10);
+ AppendString("def");
+ AppendInt(-10);
+ AppendDouble(0.5);
+ ASSERT_OK(union_builder->Finish(&actual));
+ ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);
+ }
+
+ void AppendInferred() {
+ I8 = union_builder->AppendChild(i8_builder, "i8");
+ ASSERT_EQ(I8, 0);
+ AppendInt(33);
+ AppendInt(10);
+
+ STR = union_builder->AppendChild(str_builder, "str");
+ ASSERT_EQ(STR, 1);
+ AppendString("abc");
+ AppendString("");
+ AppendString("def");
+ AppendInt(-10);
+
+ DBL = union_builder->AppendChild(dbl_builder, "dbl");
+ ASSERT_EQ(DBL, 2);
+ AppendDouble(1.0);
+ AppendDouble(-1.0);
+ AppendDouble(0.5);
+ ASSERT_OK(union_builder->Finish(&actual));
+ ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);
+
+ ASSERT_EQ(I8, 0);
+ ASSERT_EQ(STR, 1);
+ ASSERT_EQ(DBL, 2);
+ }
+
+ void AppendListOfInferred(std::shared_ptr<ListArray>* actual) {
+ ListBuilder list_builder(default_memory_pool(), union_builder);
+
+ ASSERT_OK(list_builder.Append());
+ I8 = union_builder->AppendChild(i8_builder, "i8");
+ ASSERT_EQ(I8, 0);
+ AppendInt(10);
+
+ ASSERT_OK(list_builder.Append());
+ STR = union_builder->AppendChild(str_builder, "str");
+ ASSERT_EQ(STR, 1);
+ AppendString("abc");
+ AppendInt(-10);
+
+ ASSERT_OK(list_builder.Append());
+ DBL = union_builder->AppendChild(dbl_builder, "dbl");
+ ASSERT_EQ(DBL, 2);
+ AppendDouble(0.5);
+
+ ASSERT_OK(list_builder.Finish(actual));
+ ArrayFromVector<Int8Type, uint8_t>(expected_types_vector, &expected_types);
+
+ ASSERT_EQ(I8, 0);
+ ASSERT_EQ(STR, 1);
+ ASSERT_EQ(DBL, 2);
+ }
+
+ std::vector<uint8_t> expected_types_vector;
+ std::shared_ptr<Array> expected_types;
+ std::shared_ptr<Int8Builder> i8_builder = std::make_shared<Int8Builder>();
+ std::shared_ptr<StringBuilder> str_builder = std::make_shared<StringBuilder>();
+ std::shared_ptr<DoubleBuilder> dbl_builder = std::make_shared<DoubleBuilder>();
+ std::shared_ptr<B> union_builder{new B(default_memory_pool())};
+ std::shared_ptr<UnionArray> actual;
+};
+
+class DenseUnionBuilderTest : public UnionBuilderTest<DenseUnionBuilder> {};
+class SparseUnionBuilderTest : public UnionBuilderTest<SparseUnionBuilder> {
+ public:
+ using Base = UnionBuilderTest<SparseUnionBuilder>;
+
+ void AppendInt(int8_t i) override {
+ Base::AppendInt(i);
+ ASSERT_OK(str_builder->AppendNull());
+ ASSERT_OK(dbl_builder->AppendNull());
+ }
+
+ void AppendString(const std::string& str) override {
+ Base::AppendString(str);
+ ASSERT_OK(i8_builder->AppendNull());
+ ASSERT_OK(dbl_builder->AppendNull());
+ }
+
+ void AppendDouble(double dbl) override {
+ Base::AppendDouble(dbl);
+ ASSERT_OK(i8_builder->AppendNull());
+ ASSERT_OK(str_builder->AppendNull());
+ }
+};
+
+TEST_F(DenseUnionBuilderTest, Basics) {
+ union_builder.reset(new DenseUnionBuilder(
+ default_memory_pool(), {i8_builder, str_builder, dbl_builder},
+ union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+ {I8, STR, DBL}, UnionMode::DENSE)));
+ AppendBasics();
+
+ auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]");
+ auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])");
+ auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]");
+
+ auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]");
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+ {expected_i8, expected_str, expected_dbl},
+ {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+ ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+ ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
+
+TEST_F(DenseUnionBuilderTest, InferredType) {
+ AppendInferred();
+
+ auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]");
+ auto expected_str = ArrayFromJSON(utf8(), R"(["abc", "", "def"])");
+ auto expected_dbl = ArrayFromJSON(float64(), "[1.0, -1.0, 0.5]");
+
+ auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]");
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+ {expected_i8, expected_str, expected_dbl},
+ {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+ ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+ ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
+
+TEST_F(DenseUnionBuilderTest, ListOfInferredType) {
+ std::shared_ptr<ListArray> actual;
+ AppendListOfInferred(&actual);
+
+ auto expected_type =
+ list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+ {I8, STR, DBL}, UnionMode::DENSE));
+ ASSERT_EQ(expected_type->ToString(), actual->type()->ToString());
+}
+
+TEST_F(SparseUnionBuilderTest, Basics) {
+ union_builder.reset(new SparseUnionBuilder(
+ default_memory_pool(), {i8_builder, str_builder, dbl_builder},
+ union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+ {I8, STR, DBL}, UnionMode::SPARSE)));
+
+ AppendBasics();
+
+ auto expected_i8 =
+ ArrayFromJSON(int8(), "[33, null, null, null, null, 10, null, -10, null]");
+ auto expected_str =
+ ArrayFromJSON(utf8(), R"([null, "abc", null, null, "", null, "def", null, null])");
+ auto expected_dbl =
+ ArrayFromJSON(float64(), "[null, null, 1.0, -1.0, null, null, null, null, 0.5]");
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeSparse(*expected_types,
+ {expected_i8, expected_str, expected_dbl},
+ {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+ ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+ ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
+
+TEST_F(SparseUnionBuilderTest, InferredType) {
+ AppendInferred();
+
+ auto expected_i8 =
+ ArrayFromJSON(int8(), "[33, 10, null, null, null, -10, null, null, null]");
+ auto expected_str =
+ ArrayFromJSON(utf8(), R"([null, null, "abc", "", "def", null, null, null, null])");
+ auto expected_dbl =
+ ArrayFromJSON(float64(), "[null, null, null, null, null, null, 1.0, -1.0, 0.5]");
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeSparse(*expected_types,
+ {expected_i8, expected_str, expected_dbl},
+ {"i8", "str", "dbl"}, {I8, STR, DBL}, &expected));
+
+ ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
+ ASSERT_ARRAYS_EQUAL(*expected, *actual);
+}
} // namespace arrow
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index bc38559..5f76f08 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -606,6 +606,7 @@ void UnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::UNION);
ARROW_CHECK_EQ(data->buffers.size(), 3);
+ union_type_ = checked_cast<const UnionType*>(data_->type.get());
auto type_ids = data_->buffers[1];
auto value_offsets = data_->buffers[2];
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 256bbdc..599a6ea 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -1032,9 +1032,9 @@ class ARROW_EXPORT UnionArray : public Array {
const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; }
const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
- UnionMode::type mode() const {
- return internal::checked_cast<const UnionType&>(*type()).mode();
- }
+ const UnionType* union_type() const { return union_type_; }
+
+ UnionMode::type mode() const { return union_type_->mode(); }
// Return the given field as an individual array.
// For sparse unions, the returned array has its offset, length and null
@@ -1047,6 +1047,7 @@ class ARROW_EXPORT UnionArray : public Array {
const type_id_t* raw_type_ids_;
const int32_t* raw_value_offsets_;
+ const UnionType* union_type_;
// For caching boxed child data
mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc
index 0bf9171..30b3fc0 100644
--- a/cpp/src/arrow/array/builder_nested.cc
+++ b/cpp/src/arrow/array/builder_nested.cc
@@ -60,7 +60,7 @@ Status ListBuilder::CheckNextOffset() const {
const int64_t num_values = value_builder_->length();
ARROW_RETURN_IF(
num_values > kListMaximumElements,
- Status::CapacityError("ListArray cannot contain more then 2^31 - 1 child elements,",
+ Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,",
" have ", num_values));
return Status::OK();
}
@@ -300,7 +300,7 @@ Status FixedSizeListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
// Struct
StructBuilder::StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
- std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders)
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders)
: ArrayBuilder(type, pool) {
children_ = std::move(field_builders);
}
diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h
index de03145..8742f2b 100644
--- a/cpp/src/arrow/array/builder_nested.h
+++ b/cpp/src/arrow/array/builder_nested.h
@@ -218,7 +218,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
public:
StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
- std::vector<std::shared_ptr<ArrayBuilder>>&& field_builders);
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc
index f51b7d7..8de786f 100644
--- a/cpp/src/arrow/array/builder_union.cc
+++ b/cpp/src/arrow/array/builder_union.cc
@@ -17,44 +17,101 @@
#include "arrow/array/builder_union.h"
+#include <limits>
#include <utility>
+#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
namespace arrow {
-DenseUnionBuilder::DenseUnionBuilder(MemoryPool* pool,
- const std::shared_ptr<DataType>& type)
- : ArrayBuilder(type, pool), types_builder_(pool), offsets_builder_(pool) {}
+using internal::checked_cast;
-Status DenseUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- std::shared_ptr<Buffer> types;
+Status BasicUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<Buffer> types, null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(types_builder_.Finish(&types));
- std::shared_ptr<Buffer> offsets;
- RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
- std::shared_ptr<Buffer> null_bitmap;
- RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ // If the type has not been specified in the constructor, gather type_codes
+ std::vector<uint8_t> type_codes;
+ if (type_ == nullptr) {
+ for (size_t i = 0; i < children_.size(); ++i) {
+ if (type_id_to_children_[i] != nullptr) {
+ type_codes.push_back(static_cast<uint8_t>(i));
+ }
+ }
+ } else {
+ type_codes = checked_cast<const UnionType&>(*type_).type_codes();
+ }
+ DCHECK_EQ(type_codes.size(), children_.size());
- std::vector<std::shared_ptr<Field>> fields;
std::vector<std::shared_ptr<ArrayData>> child_data(children_.size());
- std::vector<uint8_t> type_ids;
for (size_t i = 0; i < children_.size(); ++i) {
- std::shared_ptr<ArrayData> data;
- RETURN_NOT_OK(children_[i]->FinishInternal(&data));
- child_data[i] = data;
- fields.push_back(field(field_names_[i], children_[i]->type()));
- type_ids.push_back(static_cast<uint8_t>(i));
+ RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i]));
}
// If the type has not been specified in the constructor, infer it
- if (!type_) {
- type_ = union_(fields, type_ids, UnionMode::DENSE);
+ if (type_ == nullptr) {
+ std::vector<std::shared_ptr<Field>> fields;
+ auto field_names_it = field_names_.begin();
+ for (auto&& data : child_data) {
+ fields.push_back(field(*field_names_it++, data->type));
+ }
+ type_ = union_(fields, type_codes, mode_);
}
- *out = ArrayData::Make(type_, length(), {null_bitmap, types, offsets}, null_count_);
+ *out = ArrayData::Make(type_, length(), {null_bitmap, types, nullptr}, null_count_);
(*out)->child_data = std::move(child_data);
return Status::OK();
}
+BasicUnionBuilder::BasicUnionBuilder(
+ MemoryPool* pool, UnionMode::type mode,
+ const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type)
+ : ArrayBuilder(type, pool), mode_(mode), types_builder_(pool) {
+ auto union_type = checked_cast<const UnionType*>(type.get());
+ DCHECK_NE(union_type, nullptr);
+ DCHECK_EQ(union_type->mode(), mode);
+
+ children_ = children;
+ type_id_to_children_.resize(union_type->max_type_code() + 1, nullptr);
+ DCHECK_LT(type_id_to_children_.size(), std::numeric_limits<int8_t>::max());
+
+ auto field_it = type->children().begin();
+ auto children_it = children.begin();
+ for (auto type_id : union_type->type_codes()) {
+ type_id_to_children_[type_id] = *children_it++;
+ field_names_.push_back((*field_it++)->name());
+ }
+ DCHECK_EQ(children_it, children.end());
+ DCHECK_EQ(field_it, type->children().end());
+}
+
+int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
+ const std::string& field_name) {
+ // force type inferrence in Finish
+ type_ = nullptr;
+
+ field_names_.push_back(field_name);
+ children_.push_back(new_child);
+
+ // Find type_id such that type_id_to_children_[type_id] == nullptr
+ // and use that for the new child. Start searching at dense_type_id_
+ // since type_id_to_children_ is densely packed up at least up to dense_type_id_
+ for (; static_cast<size_t>(dense_type_id_) < type_id_to_children_.size();
+ ++dense_type_id_) {
+ if (type_id_to_children_[dense_type_id_] == nullptr) {
+ type_id_to_children_[dense_type_id_] = new_child;
+ return dense_type_id_++;
+ }
+ }
+
+ DCHECK_LT(type_id_to_children_.size(), std::numeric_limits<int8_t>::max());
+
+ // type_id_to_children_ is already densely packed, so just append the new child
+ type_id_to_children_.push_back(new_child);
+ return dense_type_id_++;
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h
index aac2e54..e04d5d2 100644
--- a/cpp/src/arrow/array/builder_union.h
+++ b/cpp/src/arrow/array/builder_union.h
@@ -27,25 +27,66 @@
namespace arrow {
+class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
+ public:
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
+
+ /// \brief Make a new child builder available to the UnionArray
+ ///
+ /// \param[in] new_child the child builder
+ /// \param[in] field_name the name of the field in the union array type
+ /// if type inference is used
+ /// \return child index, which is the "type" argument that needs
+ /// to be passed to the "Append" method to add a new element to
+ /// the union array.
+ int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
+ const std::string& field_name = "");
+
+ protected:
+ /// Use this constructor to initialize the UnionBuilder with no child builders,
+ /// allowing type to be inferred. You will need to call AppendChild for each of the
+ /// children builders you want to use.
+ explicit BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode)
+ : ArrayBuilder(NULLPTR, pool), mode_(mode), types_builder_(pool) {}
+
+ /// Use this constructor to specify the type explicitly.
+ /// You can still add child builders to the union after using this constructor
+ BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode,
+ const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type);
+
+ UnionMode::type mode_;
+ std::vector<std::shared_ptr<ArrayBuilder>> type_id_to_children_;
+ // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
+ int8_t dense_type_id_ = 0;
+ TypedBufferBuilder<int8_t> types_builder_;
+ std::vector<std::string> field_names_;
+};
+
/// \class DenseUnionBuilder
///
-/// You need to call AppendChild for each of the children builders you want
-/// to use. The function will return an int8_t, which is the type tag
-/// associated with that child. You can then call Append with that tag
-/// (followed by an append on the child builder) to add elements to
-/// the union array.
-///
-/// You can either specify the type when the UnionBuilder is constructed
-/// or let the UnionBuilder infer the type at runtime (by omitting the
-/// type argument from the constructor).
-///
/// This API is EXPERIMENTAL.
-class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder {
+class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
public:
- /// Use this constructor to incrementally build the union array along
- /// with types, offsets, and null bitmap.
- explicit DenseUnionBuilder(MemoryPool* pool,
- const std::shared_ptr<DataType>& type = NULLPTR);
+ /// Use this constructor to initialize the UnionBuilder with no child builders,
+ /// allowing type to be inferred. You will need to call AppendChild for each of the
+ /// children builders you want to use.
+ explicit DenseUnionBuilder(MemoryPool* pool)
+ : BasicUnionBuilder(pool, UnionMode::DENSE), offsets_builder_(pool) {}
+
+ /// Use this constructor to specify the type explicitly.
+ /// You can still add child builders to the union after using this constructor
+ DenseUnionBuilder(MemoryPool* pool,
+ const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type)
+ : BasicUnionBuilder(pool, UnionMode::DENSE, children, type),
+ offsets_builder_(pool) {}
Status AppendNull() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(0));
@@ -54,53 +95,78 @@ class ARROW_EXPORT DenseUnionBuilder : public ArrayBuilder {
}
Status AppendNulls(int64_t length) final {
- ARROW_RETURN_NOT_OK(types_builder_.Reserve(length));
- ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(length));
- ARROW_RETURN_NOT_OK(Reserve(length));
- for (int64_t i = 0; i < length; ++i) {
- types_builder_.UnsafeAppend(0);
- offsets_builder_.UnsafeAppend(0);
- }
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0));
+ ARROW_RETURN_NOT_OK(offsets_builder_.Append(length, 0));
return AppendToBitmap(length, false);
}
/// \brief Append an element to the UnionArray. This must be followed
/// by an append to the appropriate child builder.
- /// \param[in] type index of the child the value will be appended
- /// \param[in] offset offset of the value in that child
- Status Append(int8_t type, int32_t offset) {
- ARROW_RETURN_NOT_OK(types_builder_.Append(type));
+ ///
+ /// \param[in] next_type type_id of the child to which the next value will be appended.
+ ///
+ /// The corresponding child builder must be appended to independently after this method
+ /// is called.
+ Status Append(int8_t next_type) {
+ ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
+ if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
+ return Status::CapacityError(
+ "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
+ "child");
+ }
+ auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
ARROW_RETURN_NOT_OK(offsets_builder_.Append(offset));
return AppendToBitmap(true);
}
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out));
+ return offsets_builder_.Finish(&(*out)->buffers[2]);
+ }
- /// \cond FALSE
- using ArrayBuilder::Finish;
- /// \endcond
+ private:
+ TypedBufferBuilder<int32_t> offsets_builder_;
+};
- Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
+/// \class SparseUnionBuilder
+///
+/// This API is EXPERIMENTAL.
+class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
+ public:
+ /// Use this constructor to initialize the UnionBuilder with no child builders,
+ /// allowing type to be inferred. You will need to call AppendChild for each of the
+ /// children builders you want to use.
+ explicit SparseUnionBuilder(MemoryPool* pool)
+ : BasicUnionBuilder(pool, UnionMode::SPARSE) {}
- /// \brief Make a new child builder available to the UnionArray
- ///
- /// \param[in] child the child builder
- /// \param[in] field_name the name of the field in the union array type
- /// if type inference is used
- /// \return child index, which is the "type" argument that needs
- /// to be passed to the "Append" method to add a new element to
- /// the union array.
- int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& child,
- const std::string& field_name = "") {
- children_.push_back(child);
- field_names_.push_back(field_name);
- return static_cast<int8_t>(children_.size() - 1);
+ /// Use this constructor to specify the type explicitly.
+ /// You can still add child builders to the union after using this constructor
+ SparseUnionBuilder(MemoryPool* pool,
+ const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ const std::shared_ptr<DataType>& type)
+ : BasicUnionBuilder(pool, UnionMode::SPARSE, children, type) {}
+
+ Status AppendNull() final {
+ ARROW_RETURN_NOT_OK(types_builder_.Append(0));
+ return AppendToBitmap(false);
}
- private:
- TypedBufferBuilder<int8_t> types_builder_;
- TypedBufferBuilder<int32_t> offsets_builder_;
- std::vector<std::string> field_names_;
+ Status AppendNulls(int64_t length) final {
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, 0));
+ return AppendToBitmap(length, false);
+ }
+
+ /// \brief Append an element to the UnionArray. This must be followed
+ /// by an append to the appropriate child builder.
+ ///
+ /// \param[in] next_type type_id of the child to which the next value will be appended.
+ ///
+ /// The corresponding child builder must be appended to independently after this method
+ /// is called, and all other child builders must have null appended
+ Status Append(int8_t next_type) {
+ ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
+ return AppendToBitmap(true);
+ }
};
} // namespace arrow
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index f6f8042..cee443c 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -155,14 +155,32 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
case Type::STRUCT: {
const std::vector<std::shared_ptr<Field>>& fields = type->children();
- std::vector<std::shared_ptr<ArrayBuilder>> values_builder;
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
for (auto it : fields) {
std::unique_ptr<ArrayBuilder> builder;
RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder));
- values_builder.emplace_back(std::move(builder));
+ field_builders.emplace_back(std::move(builder));
+ }
+ out->reset(new StructBuilder(type, pool, std::move(field_builders)));
+ return Status::OK();
+ }
+
+ case Type::UNION: {
+ const auto& union_type = internal::checked_cast<const UnionType&>(*type);
+ const std::vector<std::shared_ptr<Field>>& fields = type->children();
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
+
+ for (auto it : fields) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder));
+ field_builders.emplace_back(std::move(builder));
+ }
+ if (union_type.mode() == UnionMode::DENSE) {
+ out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type));
+ } else {
+ out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type));
}
- out->reset(new StructBuilder(type, pool, std::move(values_builder)));
return Status::OK();
}
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 56c3e2b..223ed58 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -27,6 +27,7 @@
#include "arrow/array/builder_nested.h" // IWYU pragma: export
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array/builder_time.h" // IWYU pragma: export
+#include "arrow/array/builder_union.h" // IWYU pragma: export
#include "arrow/status.h"
#include "arrow/util/visibility.h"
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index e1525a4..097bc8f 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -234,26 +234,15 @@ class RangeEqualsVisitor {
const auto& left_type = checked_cast<const UnionType&>(*left.type());
// Define a mapping from the type id to child number
- uint8_t max_code = 0;
-
const std::vector<uint8_t>& type_codes = left_type.type_codes();
- for (size_t i = 0; i < type_codes.size(); ++i) {
- const uint8_t code = type_codes[i];
- if (code > max_code) {
- max_code = code;
- }
- }
-
- // Store mapping in a vector for constant time lookups
- std::vector<uint8_t> type_id_to_child_num(max_code + 1);
- for (uint8_t i = 0; i < static_cast<uint8_t>(type_codes.size()); ++i) {
+ std::vector<uint8_t> type_id_to_child_num(left.union_type()->max_type_code() + 1, 0);
+ for (uint8_t i = 0; i < type_codes.size(); ++i) {
type_id_to_child_num[type_codes[i]] = i;
}
const uint8_t* left_ids = left.raw_type_ids();
const uint8_t* right_ids = right.raw_type_ids();
- uint8_t id, child_num;
for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_;
++i, ++o_i) {
if (left.IsNull(i) != right.IsNull(o_i)) {
@@ -264,8 +253,7 @@ class RangeEqualsVisitor {
return false;
}
- id = left_ids[i];
- child_num = type_id_to_child_num[id];
+ auto child_num = type_id_to_child_num[left_ids[i]];
// TODO(wesm): really we should be comparing stretches of non-null data
// rather than looking at one value at a time.
@@ -773,30 +761,16 @@ class TypeEqualsVisitor {
Status Visit(const UnionType& left) {
const auto& right = checked_cast<const UnionType&>(right_);
- if (left.mode() != right.mode() ||
- left.type_codes().size() != right.type_codes().size()) {
+ if (left.mode() != right.mode() || left.type_codes() != right.type_codes()) {
result_ = false;
return Status::OK();
}
- const std::vector<uint8_t>& left_codes = left.type_codes();
- const std::vector<uint8_t>& right_codes = right.type_codes();
-
- for (size_t i = 0; i < left_codes.size(); ++i) {
- if (left_codes[i] != right_codes[i]) {
- result_ = false;
- return Status::OK();
- }
- }
-
- for (int i = 0; i < left.num_children(); ++i) {
- if (!left.child(i)->Equals(right_.child(i), check_metadata_)) {
- result_ = false;
- return Status::OK();
- }
- }
-
- result_ = true;
+ result_ = std::equal(
+ left.children().begin(), left.children().end(), right.children().begin(),
+ [this](const std::shared_ptr<Field>& l, const std::shared_ptr<Field>& r) {
+ return l->Equals(r, check_metadata_);
+ });
return Status::OK();
}
diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc
index 772557b..ce8b21a 100644
--- a/cpp/src/arrow/ipc/json-simple-test.cc
+++ b/cpp/src/arrow/ipc/json-simple-test.cc
@@ -905,6 +905,196 @@ TEST(TestStruct, Errors) {
ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[{\"c\": 0}]", &array));
}
+TEST(TestDenseUnion, Basics) {
+ auto field_a = field("a", int8());
+ auto field_b = field("b", boolean());
+
+ auto type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+ auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]");
+
+ auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+ auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]");
+ auto expected_a = ArrayFromJSON(int8(), "[122, null]");
+ auto expected_b = ArrayFromJSON(boolean(), "[true, false]");
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+ {expected_a, expected_b}, {"a", "b"}, {4, 8},
+ &expected));
+
+ ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestSparseUnion, Basics) {
+ auto field_a = field("a", int8());
+ auto field_b = field("b", boolean());
+
+ auto type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+ auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]");
+
+ auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+ auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]");
+ auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]");
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"},
+ {4, 8}, &expected));
+
+ ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestDenseUnion, ListOfUnion) {
+ auto field_a = field("a", int8());
+ auto field_b = field("b", boolean());
+ auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+ auto list_type = list(union_type);
+ auto array = ArrayFromJSON(list_type,
+ "["
+ "[[4, 122], [8, true]],"
+ "[[4, null], null, [8, false]]"
+ "]");
+
+ auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+ auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]");
+ auto expected_a = ArrayFromJSON(int8(), "[122, null]");
+ auto expected_b = ArrayFromJSON(boolean(), "[true, false]");
+
+ std::shared_ptr<Array> expected_values, expected;
+ ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets,
+ {expected_a, expected_b}, {"a", "b"}, {4, 8},
+ &expected_values));
+ auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]");
+ ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values,
+ default_memory_pool(), &expected));
+
+ ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestSparseUnion, ListOfUnion) {
+ auto field_a = field("a", int8());
+ auto field_b = field("b", boolean());
+ auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+ auto list_type = list(union_type);
+ auto array = ArrayFromJSON(list_type,
+ "["
+ "[[4, 122], [8, true]],"
+ "[[4, null], null, [8, false]]"
+ "]");
+
+ auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
+ auto expected_a = ArrayFromJSON(int8(), "[122, null, null, null, null]");
+ auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]");
+
+ std::shared_ptr<Array> expected_values, expected;
+ ASSERT_OK(UnionArray::MakeSparse(*expected_types, {expected_a, expected_b}, {"a", "b"},
+ {4, 8}, &expected_values));
+ auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]");
+ ASSERT_OK(ListArray::FromArrays(*expected_list_offsets, *expected_values,
+ default_memory_pool(), &expected));
+
+ ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestDenseUnion, UnionOfStructs) {
+ std::vector<std::shared_ptr<Field>> fields = {
+ field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})),
+ field("wtf", struct_({field("whiskey", int8()), field("tango", float64()),
+ field("foxtrot", list(int8()))})),
+ field("q", struct_({field("quebec", utf8())}))};
+ auto type = union_(fields, {0, 23, 47}, UnionMode::DENSE);
+ auto array = ArrayFromJSON(type, R"([
+ [0, {"alpha": 0.0, "bravo": "charlie"}],
+ [23, {"whiskey": 99}],
+ [0, {"bravo": "mike"}],
+ null,
+ [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}]
+ ])");
+
+ auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]");
+ auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 1, 0, 1]");
+ ArrayVector expected_fields = {ArrayFromJSON(fields[0]->type(), R"([
+ {"alpha": 0.0, "bravo": "charlie"},
+ {"bravo": "mike"}
+ ])"),
+ ArrayFromJSON(fields[1]->type(), R"([
+ {"whiskey": 99},
+ {"tango": 8.25, "foxtrot": [0, 2, 3]}
+ ])"),
+ ArrayFromJSON(fields[2]->type(), "[]")};
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeDense(*expected_types, *expected_offsets, expected_fields,
+ {"ab", "wtf", "q"}, {0, 23, 47}, &expected));
+
+ ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestSparseUnion, UnionOfStructs) {
+ std::vector<std::shared_ptr<Field>> fields = {
+ field("ab", struct_({field("alpha", float64()), field("bravo", utf8())})),
+ field("wtf", struct_({field("whiskey", int8()), field("tango", float64()),
+ field("foxtrot", list(int8()))})),
+ field("q", struct_({field("quebec", utf8())}))};
+ auto type = union_(fields, {0, 23, 47}, UnionMode::SPARSE);
+ auto array = ArrayFromJSON(type, R"([
+ [0, {"alpha": 0.0, "bravo": "charlie"}],
+ [23, {"whiskey": 99}],
+ [0, {"bravo": "mike"}],
+ null,
+ [23, {"tango": 8.25, "foxtrot": [0, 2, 3]}]
+ ])");
+
+ auto expected_types = ArrayFromJSON(int8(), "[0, 23, 0, null, 23]");
+ ArrayVector expected_fields = {
+ ArrayFromJSON(fields[0]->type(), R"([
+ {"alpha": 0.0, "bravo": "charlie"},
+ null,
+ {"bravo": "mike"},
+ null,
+ null
+ ])"),
+ ArrayFromJSON(fields[1]->type(), R"([
+ null,
+ {"whiskey": 99},
+ null,
+ null,
+ {"tango": 8.25, "foxtrot": [0, 2, 3]}
+ ])"),
+ ArrayFromJSON(fields[2]->type(), "[null, null, null, null, null]")};
+
+ std::shared_ptr<Array> expected;
+ ASSERT_OK(UnionArray::MakeSparse(*expected_types, expected_fields, {"ab", "wtf", "q"},
+ {0, 23, 47}, &expected));
+
+ ASSERT_ARRAYS_EQUAL(*expected, *array);
+}
+
+TEST(TestDenseUnion, Errors) {
+ auto field_a = field("a", int8());
+ auto field_b = field("b", boolean());
+ std::shared_ptr<DataType> type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+ std::shared_ptr<Array> array;
+
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array));
+}
+
+TEST(TestSparseUnion, Errors) {
+ auto field_a = field("a", int8());
+ auto field_b = field("b", boolean());
+ std::shared_ptr<DataType> type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+ std::shared_ptr<Array> array;
+
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0, 8]]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0]]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[4, \"\"]]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[8, true, 1]]", &array));
+}
+
} // namespace json
} // namespace internal
} // namespace ipc
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
index f850f3d..ae01bcc 100644
--- a/cpp/src/arrow/ipc/json-simple.cc
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -611,6 +611,94 @@ class StructConverter final : public ConcreteConverter<StructConverter> {
};
// ------------------------------------------------------------------------
+// Converter for struct arrays
+
+class UnionConverter final : public ConcreteConverter<UnionConverter> {
+ public:
+ explicit UnionConverter(const std::shared_ptr<DataType>& type) { type_ = type; }
+
+ Status Init() override {
+ auto union_type = checked_cast<const UnionType*>(type_.get());
+ mode_ = union_type->mode();
+ type_id_to_child_num_.clear();
+ type_id_to_child_num_.resize(union_type->max_type_code() + 1, -1);
+ int child_i = 0;
+ for (auto type_id : union_type->type_codes()) {
+ type_id_to_child_num_[type_id] = child_i++;
+ }
+ std::vector<std::shared_ptr<ArrayBuilder>> child_builders;
+ for (const auto& field : type_->children()) {
+ std::shared_ptr<Converter> child_converter;
+ RETURN_NOT_OK(GetConverter(field->type(), &child_converter));
+ child_converters_.push_back(child_converter);
+ child_builders.push_back(child_converter->builder());
+ }
+ if (mode_ == UnionMode::DENSE) {
+ builder_ = std::make_shared<DenseUnionBuilder>(default_memory_pool(),
+ std::move(child_builders), type_);
+ } else {
+ builder_ = std::make_shared<SparseUnionBuilder>(default_memory_pool(),
+ std::move(child_builders), type_);
+ }
+ return Status::OK();
+ }
+
+ Status AppendNull() override {
+ for (auto& converter : child_converters_) {
+ RETURN_NOT_OK(converter->AppendNull());
+ }
+ return builder_->AppendNull();
+ }
+
+ // Append a JSON value that is either an array of N elements in order
+ // or an object mapping struct names to values (omitted struct members
+ // are mapped to null).
+ Status AppendValue(const rj::Value& json_obj) override {
+ if (json_obj.IsNull()) {
+ return AppendNull();
+ }
+ if (!json_obj.IsArray()) {
+ return JSONTypeError("array", json_obj.GetType());
+ }
+ if (json_obj.Size() != 2) {
+ return Status::Invalid("Expected [type_id, value] pair, got array of size ",
+ json_obj.Size());
+ }
+ const auto& id_obj = json_obj[0];
+ if (!id_obj.IsInt()) {
+ return JSONTypeError("int", id_obj.GetType());
+ }
+
+ auto id = static_cast<int8_t>(id_obj.GetInt());
+ auto child_num = type_id_to_child_num_[id];
+ if (child_num == -1) {
+ return Status::Invalid("type_id ", id, " not found in ", *type_);
+ }
+
+ auto child_converter = child_converters_[child_num];
+ if (mode_ == UnionMode::DENSE) {
+ RETURN_NOT_OK(checked_cast<DenseUnionBuilder&>(*builder_).Append(id));
+ } else {
+ RETURN_NOT_OK(checked_cast<SparseUnionBuilder&>(*builder_).Append(id));
+ for (auto&& other_converter : child_converters_) {
+ if (other_converter != child_converter) {
+ RETURN_NOT_OK(other_converter->AppendNull());
+ }
+ }
+ }
+ return child_converter->AppendValue(json_obj[1]);
+ }
+
+ std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ private:
+ UnionMode::type mode_;
+ std::shared_ptr<ArrayBuilder> builder_;
+ std::vector<std::shared_ptr<Converter>> child_converters_;
+ std::vector<int8_t> type_id_to_child_num_;
+};
+
+// ------------------------------------------------------------------------
// General conversion functions
Status GetConverter(const std::shared_ptr<DataType>& type,
@@ -648,6 +736,7 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter)
SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
+ SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter)
default: {
return Status::NotImplemented("JSON conversion to ", type->ToString(),
" not implemented");
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 5784394..bc64fb7 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -77,14 +77,6 @@ class SequenceBuilder {
// Appending a none to the sequence
Status AppendNone() { return builder_->AppendNull(); }
- template <typename BuilderType>
- Status Update(BuilderType* child_builder, int8_t tag) {
- int32_t offset32 = -1;
- RETURN_NOT_OK(internal::CastSize(child_builder->length(), &offset32));
- DCHECK_GE(offset32, 0);
- return builder_->Append(tag, offset32);
- }
-
template <typename BuilderType, typename MakeBuilderFn>
Status CreateAndUpdate(std::shared_ptr<BuilderType>* child_builder, int8_t tag,
MakeBuilderFn make_builder) {
@@ -95,7 +87,7 @@ class SequenceBuilder {
convert << static_cast<int>(tag);
type_map_[tag] = builder_->AppendChild(*child_builder, convert.str());
}
- return Update(child_builder->get(), type_map_[tag]);
+ return builder_->Append(type_map_[tag]);
}
template <typename BuilderType, typename T>
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 2dbd31a..55154ba 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -273,6 +273,9 @@ std::string DurationType::ToString() const {
UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
const std::vector<uint8_t>& type_codes, UnionMode::type mode)
: NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) {
+ DCHECK_LE(fields.size(), type_codes.size()) << "union field with unknown type id";
+ DCHECK_GE(fields.size(), type_codes.size())
+ << "type id provided without corresponding union field";
children_ = fields;
}
@@ -284,6 +287,12 @@ DataTypeLayout UnionType::layout() const {
}
}
+uint8_t UnionType::max_type_code() const {
+ return type_codes_.size() == 0
+ ? 0
+ : *std::max_element(type_codes_.begin(), type_codes_.end());
+}
+
std::string UnionType::ToString() const {
std::stringstream s;
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 16f486f..10c1f90 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -686,6 +686,8 @@ class ARROW_EXPORT UnionType : public NestedType {
const std::vector<uint8_t>& type_codes() const { return type_codes_; }
+ uint8_t max_type_code() const;
+
UnionMode::type mode() const { return mode_; }
private:
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 22983e7..6c62699 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -302,6 +302,14 @@ def test_clone():
assert deserialized == (0, 'a')
+def test_primitive_serialization_notbroken(large_buffer):
+ serialization_roundtrip({(1, 2): 2}, large_buffer)
+
+
+def test_primitive_serialization_broken(large_buffer):
+ serialization_roundtrip({(): 2}, large_buffer)
+
+
def test_primitive_serialization(large_buffer):
for obj in PRIMITIVE_OBJECTS:
serialization_roundtrip(obj, large_buffer)