You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/12 18:03:46 UTC
[arrow] branch master updated: ARROW-8866: [C++] Split UNION into
SPARSE_UNION and DENSE_UNION
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 89cf7bd ARROW-8866: [C++] Split UNION into SPARSE_UNION and DENSE_UNION
89cf7bd is described below
commit 89cf7bdd6fc0d34c96f1a0e1df0a3505b308188d
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Fri Jun 12 13:03:12 2020 -0500
ARROW-8866: [C++] Split UNION into SPARSE_UNION and DENSE_UNION
Also splits `UnionType -> SparseUnionType and DenseUnionType` and similar for `UnionArray`, `UnionScalar`.
`SparseUnionArray` no longer includes the unused offsets buffer
Closes #7378 from bkietz/8866-Split-TypeUNION-into-Type
Lead-authored-by: Benjamin Kietzman <be...@gmail.com>
Co-authored-by: Wes McKinney <we...@apache.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
c_glib/arrow-glib/basic-array.cpp | 15 +-
c_glib/arrow-glib/basic-data-type.cpp | 15 +-
c_glib/arrow-glib/composite-array.cpp | 8 +-
c_glib/arrow-glib/composite-data-type.cpp | 10 +-
c_glib/arrow-glib/type.cpp | 6 +-
c_glib/arrow-glib/type.h | 6 +-
c_glib/test/test-dense-union-data-type.rb | 4 +-
c_glib/test/test-sparse-union-data-type.rb | 4 +-
cpp/src/arrow/adapters/orc/adapter_util.cc | 2 +-
cpp/src/arrow/array/array_nested.cc | 99 +++++---
cpp/src/arrow/array/array_nested.h | 187 ++++++++-------
cpp/src/arrow/array/array_test.cc | 4 +-
cpp/src/arrow/array/array_union_test.cc | 92 ++++----
cpp/src/arrow/array/array_view_test.cc | 6 +-
cpp/src/arrow/array/builder_union.cc | 27 +--
cpp/src/arrow/array/builder_union.h | 23 +-
cpp/src/arrow/array/concatenate_test.cc | 4 +-
cpp/src/arrow/array/diff.cc | 4 +-
cpp/src/arrow/array/diff_test.cc | 6 +-
cpp/src/arrow/array/util.cc | 8 +-
cpp/src/arrow/array/validate.cc | 3 +-
cpp/src/arrow/builder.cc | 46 ++--
cpp/src/arrow/c/bridge.cc | 16 +-
cpp/src/arrow/c/bridge_test.cc | 32 +--
cpp/src/arrow/compare.cc | 6 +-
cpp/src/arrow/compute/kernels/codegen_internal.cc | 44 ++--
.../arrow/compute/kernels/vector_filter_test.cc | 4 +-
.../compute/kernels/vector_selection_internal.h | 259 ++++++++++++---------
cpp/src/arrow/compute/kernels/vector_take_test.cc | 5 +-
cpp/src/arrow/ipc/json_internal.cc | 30 ++-
cpp/src/arrow/ipc/json_simple.cc | 3 +-
cpp/src/arrow/ipc/json_simple_test.cc | 36 +--
cpp/src/arrow/ipc/json_test.cc | 5 +-
cpp/src/arrow/ipc/metadata_internal.cc | 9 +-
cpp/src/arrow/ipc/read_write_test.cc | 12 +-
cpp/src/arrow/ipc/reader.cc | 5 +-
cpp/src/arrow/ipc/test_common.cc | 21 +-
cpp/src/arrow/ipc/writer.cc | 168 ++++++-------
cpp/src/arrow/pretty_print.cc | 5 +-
cpp/src/arrow/python/arrow_to_pandas.cc | 2 +-
cpp/src/arrow/python/deserialize.cc | 2 +-
cpp/src/arrow/python/serialize.cc | 4 +-
cpp/src/arrow/scalar.cc | 12 +-
cpp/src/arrow/scalar.h | 11 +-
cpp/src/arrow/testing/util.h | 5 +
cpp/src/arrow/type.cc | 146 +++++++-----
cpp/src/arrow/type.h | 66 ++++--
cpp/src/arrow/type_fwd.h | 78 +++++--
cpp/src/arrow/type_test.cc | 25 +-
cpp/src/arrow/type_traits.h | 18 +-
cpp/src/arrow/visitor.cc | 6 +-
cpp/src/arrow/visitor.h | 6 +-
cpp/src/arrow/visitor_inline.h | 3 +-
cpp/src/gandiva/llvm_types_test.cc | 3 +-
python/pyarrow/array.pxi | 10 +-
python/pyarrow/includes/libarrow.pxd | 31 ++-
python/pyarrow/lib.pyx | 3 +-
python/pyarrow/public-api.pxi | 4 +-
python/pyarrow/scalar.pxi | 7 +-
python/pyarrow/types.pxi | 6 +-
python/pyarrow/types.py | 5 +-
r/R/enums.R | 7 +-
r/R/type.R | 3 +-
ruby/red-arrow/ext/arrow/converters.hpp | 17 +-
ruby/red-arrow/ext/arrow/raw-records.cpp | 3 +-
ruby/red-arrow/ext/arrow/values.cpp | 3 +-
ruby/red-arrow/test/test-dense-union-data-type.rb | 4 +-
ruby/red-arrow/test/test-sparse-union-data-type.rb | 4 +-
68 files changed, 1014 insertions(+), 719 deletions(-)
diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp
index dc45db7..edce2ed 100644
--- a/c_glib/arrow-glib/basic-array.cpp
+++ b/c_glib/arrow-glib/basic-array.cpp
@@ -2649,16 +2649,11 @@ garrow_array_new_raw_valist(std::shared_ptr<arrow::Array> *arrow_array,
case arrow::Type::type::MAP:
type = GARROW_TYPE_MAP_ARRAY;
break;
- case arrow::Type::type::UNION:
- {
- auto arrow_union_array =
- std::static_pointer_cast<arrow::UnionArray>(*arrow_array);
- if (arrow_union_array->mode() == arrow::UnionMode::SPARSE) {
- type = GARROW_TYPE_SPARSE_UNION_ARRAY;
- } else {
- type = GARROW_TYPE_DENSE_UNION_ARRAY;
- }
- }
+ case arrow::Type::type::SPARSE_UNION:
+ type = GARROW_TYPE_SPARSE_UNION_ARRAY;
+ break;
+ case arrow::Type::type::DENSE_UNION:
+ type = GARROW_TYPE_DENSE_UNION_ARRAY;
break;
case arrow::Type::type::DICTIONARY:
type = GARROW_TYPE_DICTIONARY_ARRAY;
diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp
index 0febfe1..75984cd 100644
--- a/c_glib/arrow-glib/basic-data-type.cpp
+++ b/c_glib/arrow-glib/basic-data-type.cpp
@@ -1376,16 +1376,11 @@ garrow_data_type_new_raw(std::shared_ptr<arrow::DataType> *arrow_data_type)
case arrow::Type::type::STRUCT:
type = GARROW_TYPE_STRUCT_DATA_TYPE;
break;
- case arrow::Type::type::UNION:
- {
- auto arrow_union_data_type =
- std::static_pointer_cast<arrow::UnionType>(*arrow_data_type);
- if (arrow_union_data_type->mode() == arrow::UnionMode::SPARSE) {
- type = GARROW_TYPE_SPARSE_UNION_DATA_TYPE;
- } else {
- type = GARROW_TYPE_DENSE_UNION_DATA_TYPE;
- }
- }
+ case arrow::Type::type::SPARSE_UNION:
+ type = GARROW_TYPE_SPARSE_UNION_DATA_TYPE;
+ break;
+ case arrow::Type::type::DENSE_UNION:
+ type = GARROW_TYPE_DENSE_UNION_DATA_TYPE;
break;
case arrow::Type::type::DICTIONARY:
type = GARROW_TYPE_DICTIONARY_DATA_TYPE;
diff --git a/c_glib/arrow-glib/composite-array.cpp b/c_glib/arrow-glib/composite-array.cpp
index 322c078..14dda37 100644
--- a/c_glib/arrow-glib/composite-array.cpp
+++ b/c_glib/arrow-glib/composite-array.cpp
@@ -1019,13 +1019,13 @@ garrow_sparse_union_array_new_internal(GArrowSparseUnionDataType *data_type,
arrow_field_names.push_back(arrow_field->name());
}
arrow_sparse_union_array_result =
- arrow::UnionArray::MakeSparse(*arrow_type_ids,
+ arrow::SparseUnionArray::Make(*arrow_type_ids,
arrow_fields,
arrow_field_names,
arrow_union_data_type->type_codes());
} else {
arrow_sparse_union_array_result =
- arrow::UnionArray::MakeSparse(*arrow_type_ids, arrow_fields);
+ arrow::SparseUnionArray::Make(*arrow_type_ids, arrow_fields);
}
if (garrow::check(error,
arrow_sparse_union_array_result,
@@ -1217,14 +1217,14 @@ garrow_dense_union_array_new_internal(GArrowDenseUnionDataType *data_type,
arrow_field_names.push_back(arrow_field->name());
}
arrow_dense_union_array_result =
- arrow::UnionArray::MakeDense(*arrow_type_ids,
+ arrow::DenseUnionArray::Make(*arrow_type_ids,
*arrow_value_offsets,
arrow_fields,
arrow_field_names,
arrow_union_data_type->type_codes());
} else {
arrow_dense_union_array_result =
- arrow::UnionArray::MakeDense(*arrow_type_ids,
+ arrow::DenseUnionArray::Make(*arrow_type_ids,
*arrow_value_offsets,
arrow_fields);
}
diff --git a/c_glib/arrow-glib/composite-data-type.cpp b/c_glib/arrow-glib/composite-data-type.cpp
index 1fdbef9..3944ff2 100644
--- a/c_glib/arrow-glib/composite-data-type.cpp
+++ b/c_glib/arrow-glib/composite-data-type.cpp
@@ -573,9 +573,8 @@ garrow_sparse_union_data_type_new(GList *fields,
}
auto arrow_data_type =
- std::make_shared<arrow::UnionType>(arrow_fields,
- arrow_type_codes,
- arrow::UnionMode::SPARSE);
+ std::make_shared<arrow::SparseUnionType>(arrow_fields,
+ arrow_type_codes);
auto data_type = g_object_new(GARROW_TYPE_SPARSE_UNION_DATA_TYPE,
"data-type", &arrow_data_type,
NULL);
@@ -623,9 +622,8 @@ garrow_dense_union_data_type_new(GList *fields,
}
auto arrow_data_type =
- std::make_shared<arrow::UnionType>(arrow_fields,
- arrow_type_codes,
- arrow::UnionMode::DENSE);
+ std::make_shared<arrow::DenseUnionType>(arrow_fields,
+ arrow_type_codes);
auto data_type = g_object_new(GARROW_TYPE_DENSE_UNION_DATA_TYPE,
"data-type", &arrow_data_type,
NULL);
diff --git a/c_glib/arrow-glib/type.cpp b/c_glib/arrow-glib/type.cpp
index 8c1667f..6d46845 100644
--- a/c_glib/arrow-glib/type.cpp
+++ b/c_glib/arrow-glib/type.cpp
@@ -96,8 +96,10 @@ garrow_type_from_raw(arrow::Type::type type)
return GARROW_TYPE_STRUCT;
case arrow::Type::type::MAP:
return GARROW_TYPE_MAP;
- case arrow::Type::type::UNION:
- return GARROW_TYPE_UNION;
+ case arrow::Type::type::SPARSE_UNION:
+ return GARROW_TYPE_SPARSE_UNION;
+ case arrow::Type::type::DENSE_UNION:
+ return GARROW_TYPE_DENSE_UNION;
case arrow::Type::type::DICTIONARY:
return GARROW_TYPE_DICTIONARY;
default:
diff --git a/c_glib/arrow-glib/type.h b/c_glib/arrow-glib/type.h
index 9c9974c..ac32707 100644
--- a/c_glib/arrow-glib/type.h
+++ b/c_glib/arrow-glib/type.h
@@ -54,7 +54,8 @@ G_BEGIN_DECLS
* type. Storage type depends on the parameters.
* @GARROW_TYPE_LIST: A list of some logical data type.
* @GARROW_TYPE_STRUCT: Struct of logical types.
- * @GARROW_TYPE_UNION: Unions of logical types.
+ * @GARROW_TYPE_SPARSE_UNION: Sparse unions of logical types.
+ * @GARROW_TYPE_DENSE_UNION: Dense unions of logical types.
* @GARROW_TYPE_DICTIONARY: Dictionary aka Category type.
* @GARROW_TYPE_MAP: A repeated struct logical type.
* @GARROW_TYPE_EXTENSION: Custom data type, implemented by user.
@@ -94,7 +95,8 @@ typedef enum {
GARROW_TYPE_DECIMAL,
GARROW_TYPE_LIST,
GARROW_TYPE_STRUCT,
- GARROW_TYPE_UNION,
+ GARROW_TYPE_SPARSE_UNION,
+ GARROW_TYPE_DENSE_UNION,
GARROW_TYPE_DICTIONARY,
GARROW_TYPE_MAP,
GARROW_TYPE_EXTENSION,
diff --git a/c_glib/test/test-dense-union-data-type.rb b/c_glib/test/test-dense-union-data-type.rb
index 231767f..85d1d85 100644
--- a/c_glib/test/test-dense-union-data-type.rb
+++ b/c_glib/test/test-dense-union-data-type.rb
@@ -33,11 +33,11 @@ class TestDenseUnionDataType < Test::Unit::TestCase
end
def test_type
- assert_equal(Arrow::Type::UNION, @data_type.id)
+ assert_equal(Arrow::Type::DENSE_UNION, @data_type.id)
end
def test_to_s
- assert_equal("union[dense]<number: int32=2, text: string=9>",
+ assert_equal("dense_union<number: int32=2, text: string=9>",
@data_type.to_s)
end
diff --git a/c_glib/test/test-sparse-union-data-type.rb b/c_glib/test/test-sparse-union-data-type.rb
index 30e24f7..34626bb 100644
--- a/c_glib/test/test-sparse-union-data-type.rb
+++ b/c_glib/test/test-sparse-union-data-type.rb
@@ -33,11 +33,11 @@ class TestSparseUnionDataType < Test::Unit::TestCase
end
def test_type
- assert_equal(Arrow::Type::UNION, @data_type.id)
+ assert_equal(Arrow::Type::SPARSE_UNION, @data_type.id)
end
def test_to_s
- assert_equal("union[sparse]<number: int32=2, text: string=9>",
+ assert_equal("sparse_union<number: int32=2, text: string=9>",
@data_type.to_s)
end
diff --git a/cpp/src/arrow/adapters/orc/adapter_util.cc b/cpp/src/arrow/adapters/orc/adapter_util.cc
index 710a2a8..5a36e2c 100644
--- a/cpp/src/arrow/adapters/orc/adapter_util.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_util.cc
@@ -415,7 +415,7 @@ Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
fields.push_back(field("_union_" + std::to_string(child), elemtype));
type_codes.push_back(static_cast<int8_t>(child));
}
- *out = union_(fields, type_codes);
+ *out = sparse_union(fields, type_codes);
break;
}
default: {
diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc
index 7e4a71a..4c337fc 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -556,44 +556,77 @@ Result<ArrayVector> StructArray::Flatten(MemoryPool* pool) const {
// ----------------------------------------------------------------------
// UnionArray
-void UnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
- this->Array::SetData(data);
+void UnionArray::SetData(std::shared_ptr<ArrayData> data) {
+ this->Array::SetData(std::move(data));
- ARROW_CHECK_EQ(data->type->id(), Type::UNION);
- ARROW_CHECK_EQ(data->buffers.size(), 3);
union_type_ = checked_cast<const UnionType*>(data_->type.get());
+ ARROW_CHECK_GE(data_->buffers.size(), 2);
auto type_codes = data_->buffers[1];
- auto value_offsets = data_->buffers[2];
raw_type_codes_ = type_codes == nullptr
? nullptr
: reinterpret_cast<const int8_t*>(type_codes->data());
+ boxed_fields_.resize(data_->child_data.size());
+}
+
+void SparseUnionArray::SetData(std::shared_ptr<ArrayData> data) {
+ this->UnionArray::SetData(std::move(data));
+
+ ARROW_CHECK_EQ(data_->type->id(), Type::SPARSE_UNION);
+ ARROW_CHECK_EQ(data_->buffers.size(), 2);
+}
+
+void DenseUnionArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ this->UnionArray::SetData(std::move(data));
+
+ ARROW_CHECK_EQ(data_->type->id(), Type::DENSE_UNION);
+ ARROW_CHECK_EQ(data_->buffers.size(), 3);
+ auto value_offsets = data_->buffers[2];
raw_value_offsets_ = value_offsets == nullptr
? nullptr
: reinterpret_cast<const int32_t*>(value_offsets->data());
- boxed_fields_.resize(data->child_data.size());
}
-UnionArray::UnionArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+SparseUnionArray::SparseUnionArray(std::shared_ptr<ArrayData> data) {
+ SetData(std::move(data));
+}
+
+SparseUnionArray::SparseUnionArray(std::shared_ptr<DataType> type, int64_t length,
+ ArrayVector children,
+ std::shared_ptr<Buffer> type_codes,
+ std::shared_ptr<Buffer> null_bitmap,
+ int64_t null_count, int64_t offset) {
+ auto internal_data = ArrayData::Make(
+ std::move(type), length,
+ BufferVector{std::move(null_bitmap), std::move(type_codes)}, null_count, offset);
+ for (const auto& child : children) {
+ internal_data->child_data.push_back(child->data());
+ }
+ SetData(std::move(internal_data));
+}
+
+DenseUnionArray::DenseUnionArray(const std::shared_ptr<ArrayData>& data) {
+ SetData(data);
+}
-UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
- const std::vector<std::shared_ptr<Array>>& children,
- const std::shared_ptr<Buffer>& type_codes,
- const std::shared_ptr<Buffer>& value_offsets,
- const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
- int64_t offset) {
+DenseUnionArray::DenseUnionArray(std::shared_ptr<DataType> type, int64_t length,
+ ArrayVector children, std::shared_ptr<Buffer> type_ids,
+ std::shared_ptr<Buffer> value_offsets,
+ std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
+ int64_t offset) {
auto internal_data = ArrayData::Make(
- type, length, {null_bitmap, type_codes, value_offsets}, null_count, offset);
+ std::move(type), length,
+ BufferVector{std::move(null_bitmap), std::move(type_ids), std::move(value_offsets)},
+ null_count, offset);
for (const auto& child : children) {
internal_data->child_data.push_back(child->data());
}
SetData(internal_data);
}
-Result<std::shared_ptr<Array>> UnionArray::MakeDense(
- const Array& type_ids, const Array& value_offsets,
- const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<std::string>& field_names, const std::vector<int8_t>& type_codes) {
+Result<std::shared_ptr<Array>> DenseUnionArray::Make(
+ const Array& type_ids, const Array& value_offsets, ArrayVector children,
+ std::vector<std::string> field_names, std::vector<type_code_t> type_codes) {
if (value_offsets.length() == 0) {
return Status::Invalid("UnionArray offsets must have non-zero length");
}
@@ -607,7 +640,7 @@ Result<std::shared_ptr<Array>> UnionArray::MakeDense(
}
if (value_offsets.null_count() != 0) {
- return Status::Invalid("MakeDense does not allow NAs in value_offsets");
+ return Status::Invalid("Make does not allow NAs in value_offsets");
}
if (field_names.size() > 0 && field_names.size() != children.size()) {
@@ -622,19 +655,19 @@ Result<std::shared_ptr<Array>> UnionArray::MakeDense(
checked_cast<const Int8Array&>(type_ids).values(),
checked_cast<const Int32Array&>(value_offsets).values()};
- std::shared_ptr<DataType> union_type =
- union_(children, field_names, type_codes, UnionMode::DENSE);
- auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers),
- type_ids.null_count(), type_ids.offset());
+ auto union_type = dense_union(children, std::move(field_names), std::move(type_codes));
+ auto internal_data =
+ ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers),
+ type_ids.null_count(), type_ids.offset());
for (const auto& child : children) {
internal_data->child_data.push_back(child->data());
}
- return std::make_shared<UnionArray>(internal_data);
+ return std::make_shared<DenseUnionArray>(std::move(internal_data));
}
-Result<std::shared_ptr<Array>> UnionArray::MakeSparse(
- const Array& type_ids, const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<std::string>& field_names, const std::vector<int8_t>& type_codes) {
+Result<std::shared_ptr<Array>> SparseUnionArray::Make(
+ const Array& type_ids, ArrayVector children, std::vector<std::string> field_names,
+ std::vector<int8_t> type_codes) {
if (type_ids.type_id() != Type::INT8) {
return Status::TypeError("UnionArray type_ids must be signed int8");
}
@@ -648,11 +681,11 @@ Result<std::shared_ptr<Array>> UnionArray::MakeSparse(
}
BufferVector buffers = {type_ids.null_bitmap(),
- checked_cast<const Int8Array&>(type_ids).values(), nullptr};
- std::shared_ptr<DataType> union_type =
- union_(children, field_names, type_codes, UnionMode::SPARSE);
- auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers),
- type_ids.null_count(), type_ids.offset());
+ checked_cast<const Int8Array&>(type_ids).values()};
+ auto union_type = sparse_union(children, std::move(field_names), std::move(type_codes));
+ auto internal_data =
+ ArrayData::Make(std::move(union_type), type_ids.length(), std::move(buffers),
+ type_ids.null_count(), type_ids.offset());
for (const auto& child : children) {
internal_data->child_data.push_back(child->data());
if (child->length() != type_ids.length()) {
@@ -660,7 +693,7 @@ Result<std::shared_ptr<Array>> UnionArray::MakeSparse(
"Sparse UnionArray must have len(child) == len(type_ids) for all children");
}
}
- return std::make_shared<UnionArray>(internal_data);
+ return std::make_shared<SparseUnionArray>(std::move(internal_data));
}
std::shared_ptr<Array> UnionArray::child(int i) const { return field(i); }
diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h
index 391dfaf..666b00f 100644
--- a/cpp/src/arrow/array/array_nested.h
+++ b/cpp/src/arrow/array/array_nested.h
@@ -31,6 +31,7 @@
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -334,132 +335,158 @@ class ARROW_EXPORT StructArray : public Array {
// ----------------------------------------------------------------------
// Union
-/// Concrete Array class for union data
+/// Base class for SparseUnionArray and DenseUnionArray
class ARROW_EXPORT UnionArray : public Array {
public:
- using TypeClass = UnionType;
-
using type_code_t = int8_t;
- explicit UnionArray(const std::shared_ptr<ArrayData>& data);
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }
+
+ const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }
+
+ /// The physical child id containing value at index.
+ int child_id(int64_t i) const {
+ return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
+ }
+
+ const UnionType* union_type() const { return union_type_; }
+
+ UnionMode::type mode() const { return union_type_->mode(); }
- UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
- const std::vector<std::shared_ptr<Array>>& children,
- const std::shared_ptr<Buffer>& type_ids,
- const std::shared_ptr<Buffer>& value_offsets = NULLPTR,
- const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
- int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+ // Return the given field as an individual array.
+ // For sparse unions, the returned array has its offset, length and null
+ // count adjusted.
+ ARROW_DEPRECATED("Deprecated in 1.0.0. Use field(pos)")
+ std::shared_ptr<Array> child(int pos) const;
- /// \brief Construct Dense UnionArray from types_ids, value_offsets and children
+ /// \brief Return the given field as an individual array.
///
- /// This function does the bare minimum of validation of the offsets and
- /// input types. The value_offsets are assumed to be well-formed.
+ /// For sparse unions, the returned array has its offset, length and null
+ /// count adjusted.
+ std::shared_ptr<Array> field(int pos) const;
+
+ protected:
+ void SetData(std::shared_ptr<ArrayData> data);
+
+ const type_code_t* raw_type_codes_;
+ const UnionType* union_type_;
+
+ // For caching boxed child data
+ mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
+};
+
+/// Concrete Array class for sparse union data
+class ARROW_EXPORT SparseUnionArray : public UnionArray {
+ public:
+ using TypeClass = SparseUnionType;
+
+ explicit SparseUnionArray(std::shared_ptr<ArrayData> data);
+
+ SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
+ std::shared_ptr<Buffer> type_ids,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct SparseUnionArray from type_ids and children
+ ///
+ /// This function does the bare minimum of validation of the input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
- /// \param[in] value_offsets An array of signed int32 values indicating the
- /// relative offset into the respective child array for the type in a given slot.
- /// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
- /// \param[in] field_names Vector of strings containing the name of each field.
/// \param[in] type_codes Vector of type codes.
- static Result<std::shared_ptr<Array>> MakeDense(
- const Array& type_ids, const Array& value_offsets,
- const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<std::string>& field_names = {},
- const std::vector<type_code_t>& type_codes = {});
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
+ std::vector<type_code_t> type_codes) {
+ return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
+ std::move(type_codes));
+ }
- /// \brief Construct Dense UnionArray from types_ids, value_offsets and children
+ /// \brief Construct SparseUnionArray with custom field names from type_ids and children
///
- /// This function does the bare minimum of validation of the offsets and
- /// input types. The value_offsets are assumed to be well-formed.
+ /// This function does the bare minimum of validation of the input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
- /// \param[in] value_offsets An array of signed int32 values indicating the
- /// relative offset into the respective child array for the type in a given slot.
- /// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
/// \param[in] type_codes Vector of type codes.
- static Result<std::shared_ptr<Array>> MakeDense(
- const Array& type_ids, const Array& value_offsets,
- const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<type_code_t>& type_codes) {
- return MakeDense(type_ids, value_offsets, children, std::vector<std::string>{},
- type_codes);
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
+ std::vector<std::string> field_names = {},
+ std::vector<type_code_t> type_codes = {});
+
+ const SparseUnionType* union_type() const {
+ return internal::checked_cast<const SparseUnionType*>(union_type_);
}
- /// \brief Construct Sparse UnionArray from type_ids and children
+ protected:
+ void SetData(std::shared_ptr<ArrayData> data);
+};
+
+/// Concrete Array class for dense union data
+class ARROW_EXPORT DenseUnionArray : public UnionArray {
+ public:
+ using TypeClass = DenseUnionType;
+
+ explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);
+
+ DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
+ std::shared_ptr<Buffer> type_ids,
+ std::shared_ptr<Buffer> value_offsets = NULLPTR,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
///
/// This function does the bare minimum of validation of the offsets and
/// input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
+ /// \param[in] value_offsets An array of signed int32 values indicating the
+ /// relative offset into the respective child array for the type in a given slot.
+ /// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
- /// \param[in] field_names Vector of strings containing the name of each field.
/// \param[in] type_codes Vector of type codes.
- static Result<std::shared_ptr<Array>> MakeSparse(
- const Array& type_ids, const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<std::string>& field_names = {},
- const std::vector<type_code_t>& type_codes = {});
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
+ const Array& value_offsets,
+ ArrayVector children,
+ std::vector<type_code_t> type_codes) {
+ return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
+ std::move(type_codes));
+ }
- /// \brief Construct Sparse UnionArray from type_ids and children
+ /// \brief Construct DenseUnionArray with custom field names from type_ids,
+ /// value_offsets, and children
///
/// This function does the bare minimum of validation of the offsets and
/// input types.
///
/// \param[in] type_ids An array of logical type ids for the union type
+ /// \param[in] value_offsets An array of signed int32 values indicating the
+ /// relative offset into the respective child array for the type in a given slot.
+ /// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
/// \param[in] type_codes Vector of type codes.
- static Result<std::shared_ptr<Array>> MakeSparse(
- const Array& type_ids, const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<type_code_t>& type_codes) {
- return MakeSparse(type_ids, children, std::vector<std::string>{}, type_codes);
- }
-
- /// Note that this buffer does not account for any slice offset
- std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }
-
- const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }
-
- /// The physical child id containing value at index.
- int child_id(int64_t i) const {
- return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
+ static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
+ const Array& value_offsets,
+ ArrayVector children,
+ std::vector<std::string> field_names = {},
+ std::vector<type_code_t> type_codes = {});
+
+ const DenseUnionType* union_type() const {
+ return internal::checked_cast<const DenseUnionType*>(union_type_);
}
- /// For dense arrays only.
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
- /// For dense arrays only.
int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
- /// For dense arrays only.
const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
- const UnionType* union_type() const { return union_type_; }
-
- UnionMode::type mode() const { return union_type_->mode(); }
-
- // Return the given field as an individual array.
- // For sparse unions, the returned array has its offset, length and null
- // count adjusted.
- ARROW_DEPRECATED("Deprecated in 1.0.0. Use field(pos)")
- std::shared_ptr<Array> child(int pos) const;
-
- /// \brief Return the given field as an individual array.
- ///
- /// For sparse unions, the returned array has its offset, length and null
- /// count adjusted.
- std::shared_ptr<Array> field(int pos) const;
-
protected:
- void SetData(const std::shared_ptr<ArrayData>& data);
-
- const type_code_t* raw_type_codes_;
const int32_t* raw_value_offsets_;
- const UnionType* union_type_;
- // For caching boxed child data
- mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
+ void SetData(const std::shared_ptr<ArrayData>& data);
};
} // namespace arrow
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 8936c42..4fb0a4a 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -296,8 +296,8 @@ TEST_F(TestArray, TestMakeArrayOfNull) {
fixed_size_list(int64(), 4),
dictionary(int32(), utf8()),
struct_({field("a", utf8()), field("b", int32())}),
- union_({field("a", utf8()), field("b", int32())}, {0, 1}, UnionMode::SPARSE),
- union_({field("a", utf8()), field("b", int32())}, {0, 1}, UnionMode::DENSE)
+ sparse_union({field("a", utf8()), field("b", int32())}, {0, 1}),
+ dense_union({field("a", utf8()), field("b", int32())}, {0, 1}),
// clang-format on
};
diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc
index ee1abe1..b346efe 100644
--- a/cpp/src/arrow/array/array_union_test.cc
+++ b/cpp/src/arrow/array/array_union_test.cc
@@ -68,33 +68,33 @@ TEST(TestUnionArray, TestSliceEquals) {
TEST(TestSparseUnionArray, Validate) {
auto a = ArrayFromJSON(int32(), "[4, 5]");
- auto type = union_({field("a", int32())}, UnionMode::SPARSE);
+ auto type = sparse_union({field("a", int32())});
auto children = std::vector<std::shared_ptr<Array>>{a};
auto type_ids_array = ArrayFromJSON(int8(), "[0, 0, 0]");
auto type_ids = type_ids_array->data()->buffers[1];
- auto arr = std::make_shared<UnionArray>(type, 2, children, type_ids);
+ auto arr = std::make_shared<SparseUnionArray>(type, 2, children, type_ids);
ASSERT_OK(arr->ValidateFull());
- arr = std::make_shared<UnionArray>(type, 1, children, type_ids, nullptr, nullptr, 0,
- /*offset=*/1);
+ arr = std::make_shared<SparseUnionArray>(type, 1, children, type_ids, nullptr, 0,
+ /*offset=*/1);
ASSERT_OK(arr->ValidateFull());
- arr = std::make_shared<UnionArray>(type, 0, children, type_ids, nullptr, nullptr, 0,
- /*offset=*/2);
+ arr = std::make_shared<SparseUnionArray>(type, 0, children, type_ids, nullptr, 0,
+ /*offset=*/2);
ASSERT_OK(arr->ValidateFull());
// Length + offset < child length, but it's ok
- arr = std::make_shared<UnionArray>(type, 1, children, type_ids, nullptr, nullptr, 0,
- /*offset=*/0);
+ arr = std::make_shared<SparseUnionArray>(type, 1, children, type_ids, nullptr, 0,
+ /*offset=*/0);
ASSERT_OK(arr->ValidateFull());
// Length + offset > child length
- arr = std::make_shared<UnionArray>(type, 1, children, type_ids, nullptr, nullptr, 0,
- /*offset=*/2);
+ arr = std::make_shared<SparseUnionArray>(type, 1, children, type_ids, nullptr, 0,
+ /*offset=*/2);
ASSERT_RAISES(Invalid, arr->ValidateFull());
// Offset > child length
- arr = std::make_shared<UnionArray>(type, 0, children, type_ids, nullptr, nullptr, 0,
- /*offset=*/3);
+ arr = std::make_shared<SparseUnionArray>(type, 0, children, type_ids, nullptr, 0,
+ /*offset=*/3);
ASSERT_RAISES(Invalid, arr->ValidateFull());
}
@@ -165,43 +165,43 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) {
// without field names and type codes
ASSERT_OK_AND_ASSIGN(result,
- UnionArray::MakeDense(*type_ids_, *value_offsets, children));
+ DenseUnionArray::Make(*type_ids_, *value_offsets, children));
ASSERT_OK(result->ValidateFull());
union_array = checked_cast<const UnionArray*>(result.get());
CheckUnionArray(*union_array, UnionMode::DENSE, {"0", "1", "2", "3"}, {0, 1, 2, 3});
// with field name
ASSERT_RAISES(Invalid,
- UnionArray::MakeDense(*type_ids_, *value_offsets, children, {"one"}));
+ DenseUnionArray::Make(*type_ids_, *value_offsets, children, {"one"}));
ASSERT_OK_AND_ASSIGN(
- result, UnionArray::MakeDense(*type_ids_, *value_offsets, children, field_names));
+ result, DenseUnionArray::Make(*type_ids_, *value_offsets, children, field_names));
ASSERT_OK(result->ValidateFull());
union_array = checked_cast<const UnionArray*>(result.get());
CheckUnionArray(*union_array, UnionMode::DENSE, field_names, {0, 1, 2, 3});
// with type codes
- ASSERT_RAISES(Invalid, UnionArray::MakeDense(*logical_type_ids_, *value_offsets,
+ ASSERT_RAISES(Invalid, DenseUnionArray::Make(*logical_type_ids_, *value_offsets,
children, std::vector<int8_t>{0}));
- ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*logical_type_ids_, *value_offsets,
+ ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*logical_type_ids_, *value_offsets,
children, type_codes_));
ASSERT_OK(result->ValidateFull());
union_array = checked_cast<const UnionArray*>(result.get());
CheckUnionArray(*union_array, UnionMode::DENSE, {"0", "1", "2", "3"}, type_codes_);
// with field names and type codes
- ASSERT_RAISES(Invalid, UnionArray::MakeDense(*logical_type_ids_, *value_offsets,
+ ASSERT_RAISES(Invalid, DenseUnionArray::Make(*logical_type_ids_, *value_offsets,
children, {"one"}, type_codes_));
- ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*logical_type_ids_, *value_offsets,
+ ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*logical_type_ids_, *value_offsets,
children, field_names, type_codes_));
ASSERT_OK(result->ValidateFull());
union_array = checked_cast<const UnionArray*>(result.get());
CheckUnionArray(*union_array, UnionMode::DENSE, field_names, type_codes_);
// Invalid type codes
- ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*invalid_type_ids1_, *value_offsets,
+ ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*invalid_type_ids1_, *value_offsets,
children, type_codes_));
ASSERT_RAISES(Invalid, result->ValidateFull());
- ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeDense(*invalid_type_ids2_, *value_offsets,
+ ASSERT_OK_AND_ASSIGN(result, DenseUnionArray::Make(*invalid_type_ids2_, *value_offsets,
children, type_codes_));
ASSERT_RAISES(Invalid, result->ValidateFull());
@@ -209,11 +209,11 @@ TEST_F(TestUnionArrayFactories, TestMakeDense) {
std::shared_ptr<Array> invalid_offsets;
ArrayFromVector<Int32Type, int32_t>({1, 0, 0, 0, 1, 1, 1, 2, 1, 2}, &invalid_offsets);
ASSERT_OK_AND_ASSIGN(result,
- UnionArray::MakeDense(*type_ids_, *invalid_offsets, children));
+ DenseUnionArray::Make(*type_ids_, *invalid_offsets, children));
ASSERT_RAISES(Invalid, result->ValidateFull());
ArrayFromVector<Int32Type, int32_t>({1, 0, 0, 0, 1, -1, 1, 2, 1, 2}, &invalid_offsets);
ASSERT_OK_AND_ASSIGN(result,
- UnionArray::MakeDense(*type_ids_, *invalid_offsets, children));
+ DenseUnionArray::Make(*type_ids_, *invalid_offsets, children));
ASSERT_RAISES(Invalid, result->ValidateFull());
}
@@ -231,31 +231,31 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) {
std::shared_ptr<Array> result;
// without field names and type codes
- ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeSparse(*type_ids_, children));
+ ASSERT_OK_AND_ASSIGN(result, SparseUnionArray::Make(*type_ids_, children));
ASSERT_OK(result->ValidateFull());
CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE,
{"0", "1", "2", "3"}, {0, 1, 2, 3});
// with field names
- ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, {"one"}));
- ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeSparse(*type_ids_, children, field_names));
+ ASSERT_RAISES(Invalid, SparseUnionArray::Make(*type_ids_, children, {"one"}));
+ ASSERT_OK_AND_ASSIGN(result, SparseUnionArray::Make(*type_ids_, children, field_names));
ASSERT_OK(result->ValidateFull());
CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE, field_names,
{0, 1, 2, 3});
// with type codes
- ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*logical_type_ids_, children,
+ ASSERT_RAISES(Invalid, SparseUnionArray::Make(*logical_type_ids_, children,
std::vector<int8_t>{0}));
ASSERT_OK_AND_ASSIGN(result,
- UnionArray::MakeSparse(*logical_type_ids_, children, type_codes_));
+ SparseUnionArray::Make(*logical_type_ids_, children, type_codes_));
ASSERT_OK(result->ValidateFull());
CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE,
{"0", "1", "2", "3"}, type_codes_);
// with field names and type codes
- ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*logical_type_ids_, children, {"one"},
+ ASSERT_RAISES(Invalid, SparseUnionArray::Make(*logical_type_ids_, children, {"one"},
type_codes_));
- ASSERT_OK_AND_ASSIGN(result, UnionArray::MakeSparse(*logical_type_ids_, children,
+ ASSERT_OK_AND_ASSIGN(result, SparseUnionArray::Make(*logical_type_ids_, children,
field_names, type_codes_));
ASSERT_OK(result->ValidateFull());
CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE, field_names,
@@ -263,15 +263,15 @@ TEST_F(TestUnionArrayFactories, TestMakeSparse) {
// Invalid type codes
ASSERT_OK_AND_ASSIGN(
- result, UnionArray::MakeSparse(*invalid_type_ids1_, children, type_codes_));
+ result, SparseUnionArray::Make(*invalid_type_ids1_, children, type_codes_));
ASSERT_RAISES(Invalid, result->ValidateFull());
ASSERT_OK_AND_ASSIGN(
- result, UnionArray::MakeSparse(*invalid_type_ids2_, children, type_codes_));
+ result, SparseUnionArray::Make(*invalid_type_ids2_, children, type_codes_));
ASSERT_RAISES(Invalid, result->ValidateFull());
// Invalid child length
ArrayFromVector<Int8Type>({0, 0, 0, 0, 0, -12, 0, 0, 0}, &children[3]);
- ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children));
+ ASSERT_RAISES(Invalid, SparseUnionArray::Make(*type_ids_, children));
}
template <typename B>
@@ -396,8 +396,8 @@ class SparseUnionBuilderTest : public UnionBuilderTest<SparseUnionBuilder> {
TEST_F(DenseUnionBuilderTest, Basics) {
union_builder.reset(new DenseUnionBuilder(
default_memory_pool(), {i8_builder, str_builder, dbl_builder},
- union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
- {I8, STR, DBL}, UnionMode::DENSE)));
+ dense_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+ {I8, STR, DBL})));
AppendBasics();
auto expected_i8 = ArrayFromJSON(int8(), "[33, 10, -10]");
@@ -407,7 +407,7 @@ TEST_F(DenseUnionBuilderTest, Basics) {
auto expected_offsets = ArrayFromJSON(int32(), "[0, 0, 0, 1, 1, 1, 2, 2, 2]");
ASSERT_OK_AND_ASSIGN(auto expected,
- UnionArray::MakeDense(*expected_types, *expected_offsets,
+ DenseUnionArray::Make(*expected_types, *expected_offsets,
{expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}));
@@ -425,7 +425,7 @@ TEST_F(DenseUnionBuilderTest, InferredType) {
auto expected_offsets = ArrayFromJSON(int32(), "[0, 1, 0, 1, 2, 2, 0, 1, 2]");
ASSERT_OK_AND_ASSIGN(auto expected,
- UnionArray::MakeDense(*expected_types, *expected_offsets,
+ DenseUnionArray::Make(*expected_types, *expected_offsets,
{expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}));
@@ -437,17 +437,17 @@ TEST_F(DenseUnionBuilderTest, ListOfInferredType) {
std::shared_ptr<ListArray> actual;
AppendListOfInferred(&actual);
- auto expected_type =
- list(union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
- {I8, STR, DBL}, UnionMode::DENSE));
+ auto expected_type = list(
+ dense_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+ {I8, STR, DBL}));
ASSERT_EQ(expected_type->ToString(), actual->type()->ToString());
}
TEST_F(SparseUnionBuilderTest, Basics) {
union_builder.reset(new SparseUnionBuilder(
default_memory_pool(), {i8_builder, str_builder, dbl_builder},
- union_({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
- {I8, STR, DBL}, UnionMode::SPARSE)));
+ sparse_union({field("i8", int8()), field("str", utf8()), field("dbl", float64())},
+ {I8, STR, DBL})));
AppendBasics();
@@ -460,7 +460,7 @@ TEST_F(SparseUnionBuilderTest, Basics) {
ASSERT_OK_AND_ASSIGN(
auto expected,
- UnionArray::MakeSparse(*expected_types, {expected_i8, expected_str, expected_dbl},
+ SparseUnionArray::Make(*expected_types, {expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}));
ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
@@ -479,7 +479,7 @@ TEST_F(SparseUnionBuilderTest, InferredType) {
ASSERT_OK_AND_ASSIGN(
auto expected,
- UnionArray::MakeSparse(*expected_types, {expected_i8, expected_str, expected_dbl},
+ SparseUnionArray::Make(*expected_types, {expected_i8, expected_str, expected_dbl},
{"i8", "str", "dbl"}, {I8, STR, DBL}));
ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString());
@@ -491,8 +491,8 @@ TEST_F(SparseUnionBuilderTest, StructWithUnion) {
StructBuilder builder(struct_({field("u", union_builder->type())}),
default_memory_pool(), {union_builder});
ASSERT_EQ(union_builder->AppendChild(std::make_shared<Int32Builder>(), "i"), 0);
- ASSERT_TRUE(
- builder.type()->Equals(struct_({field("u", union_({field("i", int32())}, {0}))})));
+ ASSERT_TRUE(builder.type()->Equals(
+ struct_({field("u", sparse_union({field("i", int32())}, {0}))})));
}
} // namespace arrow
diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc
index e9d87bd..dc61a0d 100644
--- a/cpp/src/arrow/array/array_view_test.cc
+++ b/cpp/src/arrow/array/array_view_test.cc
@@ -333,7 +333,7 @@ TEST(TestArrayView, SparseUnionAsStruct) {
auto child1 = ArrayFromJSON(int16(), "[0, -1, 42]");
auto child2 = ArrayFromJSON(int32(), "[0, 1069547520, -1071644672]");
auto indices = ArrayFromJSON(int8(), "[0, 0, 1]");
- ASSERT_OK_AND_ASSIGN(auto arr, UnionArray::MakeSparse(*indices, {child1, child2}));
+ ASSERT_OK_AND_ASSIGN(auto arr, SparseUnionArray::Make(*indices, {child1, child2}));
ASSERT_OK(arr->ValidateFull());
auto ty1 = struct_({field("a", int8()), field("b", uint16()), field("c", float32())});
@@ -343,7 +343,7 @@ TEST(TestArrayView, SparseUnionAsStruct) {
// With nulls
indices = ArrayFromJSON(int8(), "[null, 0, 1]");
- ASSERT_OK_AND_ASSIGN(arr, UnionArray::MakeSparse(*indices, {child1, child2}));
+ ASSERT_OK_AND_ASSIGN(arr, SparseUnionArray::Make(*indices, {child1, child2}));
ASSERT_OK(arr->ValidateFull());
expected = ArrayFromJSON(ty1, "[null, [0, 65535, 1.5], [1, 42, -2.5]]");
CheckView(arr, expected);
@@ -352,7 +352,7 @@ TEST(TestArrayView, SparseUnionAsStruct) {
// With nested nulls
child1 = ArrayFromJSON(int16(), "[0, -1, null]");
child2 = ArrayFromJSON(int32(), "[0, null, -1071644672]");
- ASSERT_OK_AND_ASSIGN(arr, UnionArray::MakeSparse(*indices, {child1, child2}));
+ ASSERT_OK_AND_ASSIGN(arr, SparseUnionArray::Make(*indices, {child1, child2}));
ASSERT_OK(arr->ValidateFull());
expected = ArrayFromJSON(ty1, "[null, [0, 65535, null], [1, null, -2.5]]");
CheckView(arr, expected);
diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc
index ace91de..a5e4c13 100644
--- a/cpp/src/arrow/array/builder_union.cc
+++ b/cpp/src/arrow/array/builder_union.cc
@@ -39,22 +39,25 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i]));
}
- *out = ArrayData::Make(type(), length(), {null_bitmap, types, nullptr}, null_count_);
+ *out = ArrayData::Make(type(), length(), {null_bitmap, types}, null_count_);
(*out)->child_data = std::move(child_data);
return Status::OK();
}
+Status DenseUnionBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out));
+ (*out)->buffers.resize(3);
+ ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&(*out)->buffers[2]));
+ return Status::OK();
+}
+
BasicUnionBuilder::BasicUnionBuilder(
- MemoryPool* pool, UnionMode::type mode,
- const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+ MemoryPool* pool, const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
- : ArrayBuilder(pool),
- child_fields_(children.size()),
- mode_(mode),
- types_builder_(pool) {
- DCHECK_EQ(type->id(), Type::UNION);
+ : ArrayBuilder(pool), child_fields_(children.size()), types_builder_(pool) {
const auto& union_type = checked_cast<const UnionType&>(*type);
- DCHECK_EQ(union_type.mode(), mode);
+ mode_ = union_type.mode();
+
DCHECK_EQ(children.size(), union_type.type_codes().size());
type_codes_ = union_type.type_codes();
@@ -73,9 +76,6 @@ BasicUnionBuilder::BasicUnionBuilder(
}
}
-BasicUnionBuilder::BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode)
- : BasicUnionBuilder(pool, mode, {}, union_(mode)) {}
-
int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
const std::string& field_name) {
children_.push_back(new_child);
@@ -96,7 +96,8 @@ std::shared_ptr<DataType> BasicUnionBuilder::type() const {
for (size_t i = 0; i < child_fields.size(); ++i) {
child_fields[i] = child_fields_[i]->WithType(children_[i]->type());
}
- return union_(std::move(child_fields), type_codes_, mode_);
+ return mode_ == UnionMode::SPARSE ? sparse_union(std::move(child_fields), type_codes_)
+ : dense_union(std::move(child_fields), type_codes_);
}
int8_t BasicUnionBuilder::NextTypeId() {
diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h
index aba707e..e337673 100644
--- a/cpp/src/arrow/array/builder_union.h
+++ b/cpp/src/arrow/array/builder_union.h
@@ -57,14 +57,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
std::shared_ptr<DataType> type() const override;
protected:
- /// Use this constructor to initialize the UnionBuilder with no child builders,
- /// allowing type to be inferred. You will need to call AppendChild for each of the
- /// children builders you want to use.
- BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode);
-
- /// Use this constructor to specify the type explicitly.
- /// You can still add child builders to the union after using this constructor
- BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode,
+ BasicUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type);
@@ -89,15 +82,14 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
/// allowing type to be inferred. You will need to call AppendChild for each of the
/// children builders you want to use.
explicit DenseUnionBuilder(MemoryPool* pool)
- : BasicUnionBuilder(pool, UnionMode::DENSE), offsets_builder_(pool) {}
+ : BasicUnionBuilder(pool, {}, dense_union(FieldVector{})), offsets_builder_(pool) {}
/// Use this constructor to specify the type explicitly.
/// You can still add child builders to the union after using this constructor
DenseUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
- : BasicUnionBuilder(pool, UnionMode::DENSE, children, type),
- offsets_builder_(pool) {}
+ : BasicUnionBuilder(pool, children, type), offsets_builder_(pool) {}
Status AppendNull() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(0));
@@ -130,10 +122,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
return AppendToBitmap(true);
}
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
- ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out));
- return offsets_builder_.Finish(&(*out)->buffers[2]);
- }
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
private:
TypedBufferBuilder<int32_t> offsets_builder_;
@@ -148,14 +137,14 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
/// allowing type to be inferred. You will need to call AppendChild for each of the
/// children builders you want to use.
explicit SparseUnionBuilder(MemoryPool* pool)
- : BasicUnionBuilder(pool, UnionMode::SPARSE) {}
+ : BasicUnionBuilder(pool, {}, sparse_union(FieldVector{})) {}
/// Use this constructor to specify the type explicitly.
/// You can still add child builders to the union after using this constructor
SparseUnionBuilder(MemoryPool* pool,
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
const std::shared_ptr<DataType>& type)
- : BasicUnionBuilder(pool, UnionMode::SPARSE, children, type) {}
+ : BasicUnionBuilder(pool, children, type) {}
Status AppendNull() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(0));
diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc
index 04e3a8f..9c887eb 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -218,7 +218,7 @@ TEST_F(ConcatenateTest, DISABLED_UnionType) {
auto bar = this->GeneratePrimitive<DoubleType>(size, null_probability);
auto baz = this->GeneratePrimitive<BooleanType>(size, null_probability);
auto type_ids = rng_.Numeric<Int8Type>(size, 0, 2, null_probability);
- ASSERT_OK_AND_ASSIGN(*out, UnionArray::MakeSparse(*type_ids, {foo, bar, baz}));
+ ASSERT_OK_AND_ASSIGN(*out, SparseUnionArray::Make(*type_ids, {foo, bar, baz}));
});
// dense mode
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
@@ -228,7 +228,7 @@ TEST_F(ConcatenateTest, DISABLED_UnionType) {
auto type_ids = rng_.Numeric<Int8Type>(size, 0, 2, null_probability);
auto value_offsets = rng_.Numeric<Int32Type>(size, 0, size, 0);
ASSERT_OK_AND_ASSIGN(
- *out, UnionArray::MakeDense(*type_ids, *value_offsets, {foo, bar, baz}));
+ *out, DenseUnionArray::Make(*type_ids, *value_offsets, {foo, bar, baz}));
});
}
diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc
index 377f77a..6b98feb 100644
--- a/cpp/src/arrow/array/diff.cc
+++ b/cpp/src/arrow/array/diff.cc
@@ -574,7 +574,7 @@ class MakeFormatterImpl {
using UnionImpl::UnionImpl;
void operator()(const Array& array, int64_t index, std::ostream* os) {
- const auto& union_array = checked_cast<const UnionArray&>(array);
+ const auto& union_array = checked_cast<const SparseUnionArray&>(array);
DoFormat(union_array, index, index, os);
}
};
@@ -583,7 +583,7 @@ class MakeFormatterImpl {
using UnionImpl::UnionImpl;
void operator()(const Array& array, int64_t index, std::ostream* os) {
- const auto& union_array = checked_cast<const UnionArray&>(array);
+ const auto& union_array = checked_cast<const DenseUnionArray&>(array);
DoFormat(union_array, index, union_array.raw_value_offsets()[index], os);
}
};
diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc
index 827c889..bfe46d4 100644
--- a/cpp/src/arrow/array/diff_test.cc
+++ b/cpp/src/arrow/array/diff_test.cc
@@ -350,7 +350,7 @@ TEST_F(DiffTest, BasicsWithStructs) {
}
TEST_F(DiffTest, BasicsWithUnions) {
- auto type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5});
+ auto type = sparse_union({field("foo", utf8()), field("bar", int32())}, {2, 5});
// insert one
base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
@@ -509,8 +509,8 @@ TEST_F(DiffTest, UnifiedDiffFormatter) {
)");
// unions
- for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) {
- type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5}, mode);
+ for (auto union_ : UnionTypeFactories()) {
+ type = union_({field("foo", utf8()), field("bar", int32())}, {2, 5});
base_ = ArrayFromJSON(type, R"([[2, "!"], [5, 3], [5, 13]])");
target_ = ArrayFromJSON(type, R"([[2, "!"], [2, "3"], [5, 13]])");
AssertDiffAndFormat(R"(
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc
index 2760258..a30849a 100644
--- a/cpp/src/arrow/array/util.cc
+++ b/cpp/src/arrow/array/util.cc
@@ -233,12 +233,8 @@ class NullArrayFactory {
}
Status Visit(const UnionType& type) {
- if (type.mode() == UnionMode::DENSE) {
- out_->buffers.resize(3, buffer_);
- } else {
- out_->buffers = {buffer_, buffer_, nullptr};
- }
-
+ auto n_buffers = type.mode() == UnionMode::SPARSE ? 2 : 3;
+ out_->buffers.resize(n_buffers, buffer_);
for (int i = 0; i < type_->num_fields(); ++i) {
ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(i, length_));
}
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index e17243e..f422961 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -438,7 +438,8 @@ struct ValidateArrayDataVisitor {
}
// Check offsets
- const int32_t* offsets = array.raw_value_offsets();
+ const int32_t* offsets =
+ checked_cast<const DenseUnionArray&>(array).raw_value_offsets();
for (int64_t i = 0; i < array.length(); ++i) {
if (array.IsNull(i)) {
continue;
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index f314c29..6be07d6 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -80,6 +80,19 @@ struct DictionaryBuilderCase {
out->reset(new TYPE_CLASS##Builder(type, pool)); \
return Status::OK();
+Result<std::vector<std::shared_ptr<ArrayBuilder>>> FieldBuilders(const DataType& type,
+ MemoryPool* pool) {
+ std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
+
+ for (const auto& field : type.fields()) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(pool, field->type(), &builder));
+ field_builders.emplace_back(std::move(builder));
+ }
+
+ return field_builders;
+}
+
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
std::unique_ptr<ArrayBuilder>* out) {
switch (type->id()) {
@@ -158,33 +171,20 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
}
case Type::STRUCT: {
- const std::vector<std::shared_ptr<Field>>& fields = type->fields();
- std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
-
- for (const auto& it : fields) {
- std::unique_ptr<ArrayBuilder> builder;
- RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder));
- field_builders.emplace_back(std::move(builder));
- }
+ ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool));
out->reset(new StructBuilder(type, pool, std::move(field_builders)));
return Status::OK();
}
- case Type::UNION: {
- const auto& union_type = internal::checked_cast<const UnionType&>(*type);
- const std::vector<std::shared_ptr<Field>>& fields = type->fields();
- std::vector<std::shared_ptr<ArrayBuilder>> field_builders;
-
- for (const auto& it : fields) {
- std::unique_ptr<ArrayBuilder> builder;
- RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder));
- field_builders.emplace_back(std::move(builder));
- }
- if (union_type.mode() == UnionMode::DENSE) {
- out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type));
- } else {
- out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type));
- }
+ case Type::SPARSE_UNION: {
+ ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool));
+ out->reset(new SparseUnionBuilder(pool, std::move(field_builders), type));
+ return Status::OK();
+ }
+
+ case Type::DENSE_UNION: {
+ ARROW_ASSIGN_OR_RAISE(auto field_builders, FieldBuilders(*type, pool));
+ out->reset(new DenseUnionBuilder(pool, std::move(field_builders), type));
return Status::OK();
}
diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc
index c0e8e1d..4535543 100644
--- a/cpp/src/arrow/c/bridge.cc
+++ b/cpp/src/arrow/c/bridge.cc
@@ -1067,7 +1067,11 @@ struct SchemaImporter {
c_struct_->format, "'");
}
}
- type_ = union_(std::move(fields), std::move(type_codes), mode);
+ if (mode == UnionMode::SPARSE) {
+ type_ = sparse_union(std::move(fields), std::move(type_codes));
+ } else {
+ type_ = dense_union(std::move(fields), std::move(type_codes));
+ }
return Status::OK();
}
@@ -1322,14 +1326,16 @@ struct ArrayImporter {
Status Visit(const UnionType& type) {
auto mode = type.mode();
- RETURN_NOT_OK(CheckNumBuffers(3));
+ if (mode == UnionMode::SPARSE) {
+ RETURN_NOT_OK(CheckNumBuffers(2));
+ } else {
+ RETURN_NOT_OK(CheckNumBuffers(3));
+ }
RETURN_NOT_OK(AllocateArrayData());
RETURN_NOT_OK(ImportNullBitmap());
RETURN_NOT_OK(ImportFixedSizeBuffer(1, sizeof(int8_t)));
if (mode == UnionMode::DENSE) {
RETURN_NOT_OK(ImportFixedSizeBuffer(2, sizeof(int32_t)));
- } else {
- RETURN_NOT_OK(ImportUnusedBuffer(2));
}
return Status::OK();
}
@@ -1419,8 +1425,6 @@ struct ArrayImporter {
return ImportBuffer(buffer_id, buffer_size);
}
- Status ImportUnusedBuffer(int32_t buffer_id) { return ImportBuffer(buffer_id, 0); }
-
Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) {
// Compute visible size of buffer
int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset);
diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc
index 33d49cc..43273a3 100644
--- a/cpp/src/arrow/c/bridge_test.cc
+++ b/cpp/src/arrow/c/bridge_test.cc
@@ -347,13 +347,13 @@ TEST_F(TestSchemaExport, Union) {
// Dense
auto field_a = field("a", int8());
auto field_b = field("b", boolean(), /*nullable=*/false);
- auto type = union_({field_a, field_b}, {42, 43}, UnionMode::DENSE);
+ auto type = dense_union({field_a, field_b}, {42, 43});
TestNested(type, {"+ud:42,43", "c", "b"}, {"", "a", "b"},
{ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE, 0});
// Sparse
field_a = field("a", int8(), /*nullable=*/false);
field_b = field("b", boolean());
- type = union_({field_a, field_b}, {42, 43}, UnionMode::SPARSE);
+ type = sparse_union({field_a, field_b}, {42, 43});
TestNested(type, {"+us:42,43", "c", "b"}, {"", "a", "b"},
{ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE});
}
@@ -800,12 +800,12 @@ TEST_F(TestArrayExport, Union) {
// Dense
auto field_a = field("a", int8());
auto field_b = field("b", boolean(), /*nullable=*/false);
- auto type = union_({field_a, field_b}, {42, 43}, UnionMode::DENSE);
+ auto type = dense_union({field_a, field_b}, {42, 43});
TestNested(type, data);
// Sparse
field_a = field("a", int8(), /*nullable=*/false);
field_b = field("b", boolean());
- type = union_({field_a, field_b}, {42, 43}, UnionMode::SPARSE);
+ type = sparse_union({field_a, field_b}, {42, 43});
TestNested(type, data);
}
@@ -1297,7 +1297,7 @@ TEST_F(TestSchemaImport, Union) {
FillPrimitive(AddChild(), "c", "ints");
FillStructLike("+us:43,42", 2);
auto expected =
- union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::SPARSE);
+ sparse_union({field("strs", utf8()), field("ints", int8())}, {43, 42});
CheckImport(expected);
// Dense
@@ -1305,7 +1305,7 @@ TEST_F(TestSchemaImport, Union) {
FillPrimitive(AddChild(), "c", "ints");
FillStructLike("+ud:43,42", 2);
expected =
- union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::DENSE);
+ dense_union({field("strs", utf8()), field("ints", int8())}, {43, 42});
CheckImport(expected);
}
@@ -1677,13 +1677,13 @@ class TestArrayImport : public ::testing::Test {
c->children = NLastChildren(c->n_children, c);
}
- void FillUnionLike(struct ArrowArray* c, int64_t length,
+ void FillUnionLike(struct ArrowArray* c, UnionMode::type mode, int64_t length,
int64_t null_count, int64_t offset,
int64_t n_children, const void** buffers) {
c->length = length;
c->null_count = null_count;
c->offset = offset;
- c->n_buffers = 3;
+ c->n_buffers = mode == UnionMode::SPARSE ? 2 : 3;
c->buffers = buffers;
c->n_children = n_children;
c->children = NLastChildren(c->n_children, c);
@@ -1717,10 +1717,10 @@ class TestArrayImport : public ::testing::Test {
FillStructLike(&c_struct_, length, null_count, offset, n_children, buffers);
}
- void FillUnionLike(int64_t length, int64_t null_count,
+ void FillUnionLike(UnionMode::type mode, int64_t length, int64_t null_count,
int64_t offset, int64_t n_children,
const void** buffers) {
- FillUnionLike(&c_struct_, length, null_count, offset, n_children, buffers);
+ FillUnionLike(&c_struct_, mode, length, null_count, offset, n_children, buffers);
}
void CheckImport(const std::shared_ptr<Array>& expected) {
@@ -2021,9 +2021,9 @@ TEST_F(TestArrayImport, Union) {
// Sparse
FillStringLike(AddChild(), 4, 0, 0, string_buffers_no_nulls1);
FillPrimitive(AddChild(), 4, -1, 0, primitive_buffers_nulls1_8);
- FillUnionLike(4, 0, 0, 2, sparse_union_buffers_no_nulls1);
+ FillUnionLike(UnionMode::SPARSE, 4, 0, 0, 2, sparse_union_buffers_no_nulls1);
auto type =
- union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::SPARSE);
+ sparse_union({field("strs", utf8()), field("ints", int8())}, {43, 42});
auto expected =
ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "bar"], [43, "quux"]])");
CheckImport(expected);
@@ -2031,9 +2031,9 @@ TEST_F(TestArrayImport, Union) {
// Dense
FillStringLike(AddChild(), 2, 0, 0, string_buffers_no_nulls1);
FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1_8);
- FillUnionLike(5, 0, 0, 2, dense_union_buffers_no_nulls1);
+ FillUnionLike(UnionMode::DENSE, 5, 0, 0, 2, dense_union_buffers_no_nulls1);
type =
- union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::DENSE);
+ dense_union({field("strs", utf8()), field("ints", int8())}, {43, 42});
expected =
ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "foo"], [43, ""], [42, 3]])");
CheckImport(expected);
@@ -2405,9 +2405,9 @@ TEST_F(TestSchemaRoundtrip, Union) {
auto f2 = field("f2", list(decimal(19, 4)));
auto type_codes = std::vector<int8_t>{42, 43};
- TestWithTypeFactory([&]() { return union_({f1, f2}, type_codes, UnionMode::SPARSE); });
+ TestWithTypeFactory([&]() { return sparse_union({f1, f2}, type_codes); });
f2 = f2->WithMetadata(key_value_metadata(kMetadataKeys2, kMetadataValues2));
- TestWithTypeFactory([&]() { return union_({f1, f2}, type_codes, UnionMode::DENSE); });
+ TestWithTypeFactory([&]() { return dense_union({f1, f2}, type_codes); });
}
TEST_F(TestSchemaRoundtrip, Dictionary) {
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index db40338..7600746 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -262,8 +262,10 @@ class RangeEqualsVisitor {
return false;
}
} else {
- const int32_t offset = left.raw_value_offsets()[i];
- const int32_t o_offset = right.raw_value_offsets()[o_i];
+ const int32_t offset =
+ checked_cast<const DenseUnionArray&>(left).raw_value_offsets()[i];
+ const int32_t o_offset =
+ checked_cast<const DenseUnionArray&>(right).raw_value_offsets()[o_i];
if (!left.field(child_num)->RangeEquals(offset, offset + 1, o_offset,
right.field(child_num))) {
return false;
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 3ea3570..427ad2b 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -31,8 +31,8 @@ void ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ctx->SetStatus(Status::NotImplemented("This kernel is malformed"));
}
-void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec exec,
- const ExecBatch& batch, Datum* out) {
+void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec exec, const ExecBatch& batch,
+ Datum* out) {
ExecBatch flipped_batch = batch;
std::swap(flipped_batch.values[0], flipped_batch.values[1]);
exec(ctx, flipped_batch, out);
@@ -74,10 +74,15 @@ static void InitStaticData() {
Extend(g_floating_types, &g_numeric_types);
// Temporal types
- g_temporal_types = {date32(), date64(), time32(TimeUnit::SECOND),
- time32(TimeUnit::MILLI), time64(TimeUnit::MICRO),
- time64(TimeUnit::NANO), timestamp(TimeUnit::SECOND),
- timestamp(TimeUnit::MILLI), timestamp(TimeUnit::MICRO),
+ g_temporal_types = {date32(),
+ date64(),
+ time32(TimeUnit::SECOND),
+ time32(TimeUnit::MILLI),
+ time64(TimeUnit::MICRO),
+ time64(TimeUnit::NANO),
+ timestamp(TimeUnit::SECOND),
+ timestamp(TimeUnit::MILLI),
+ timestamp(TimeUnit::MICRO),
timestamp(TimeUnit::NANO)};
// Base binary types (without FixedSizeBinary)
@@ -142,19 +147,20 @@ const std::vector<std::shared_ptr<DataType>>& PrimitiveTypes() {
const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes() {
static DataTypeVector example_parametric_types = {
- decimal(12, 2),
- duration(TimeUnit::SECOND),
- timestamp(TimeUnit::SECOND),
- time32(TimeUnit::SECOND),
- time64(TimeUnit::MICRO),
- fixed_size_binary(0),
- list(null()),
- large_list(null()),
- fixed_size_list(field("dummy", null()), 0),
- struct_({}),
- union_({}),
- dictionary(int32(), null()),
- map(null(), null())};
+ decimal(12, 2),
+ duration(TimeUnit::SECOND),
+ timestamp(TimeUnit::SECOND),
+ time32(TimeUnit::SECOND),
+ time64(TimeUnit::MICRO),
+ fixed_size_binary(0),
+ list(null()),
+ large_list(null()),
+ fixed_size_list(field("dummy", null()), 0),
+ struct_({}),
+ sparse_union(FieldVector{}),
+ dense_union(FieldVector{}),
+ dictionary(int32(), null()),
+ map(null(), null())};
return example_parametric_types;
}
diff --git a/cpp/src/arrow/compute/kernels/vector_filter_test.cc b/cpp/src/arrow/compute/kernels/vector_filter_test.cc
index a835417..3277891 100644
--- a/cpp/src/arrow/compute/kernels/vector_filter_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_filter_test.cc
@@ -488,8 +488,8 @@ TEST_F(TestFilterKernelWithStruct, FilterStruct) {
class TestFilterKernelWithUnion : public TestFilterKernel<UnionType> {};
TEST_F(TestFilterKernelWithUnion, FilterUnion) {
- for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) {
- auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}, mode);
+ for (auto union_ : UnionTypeFactories()) {
+ auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5});
auto union_json = R"([
null,
[2, 222],
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
index 6333ef3..8908b3b 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
@@ -494,28 +494,99 @@ class TakerImpl<IndexSequence, StructType> : public Taker<IndexSequence> {
};
template <typename IndexSequence>
-class TakerImpl<IndexSequence, UnionType> : public Taker<IndexSequence> {
+class TakerImpl<IndexSequence, SparseUnionType> : public Taker<IndexSequence> {
public:
using Taker<IndexSequence>::Taker;
Status Init() override {
- union_type_ = checked_cast<const UnionType*>(this->type_.get());
+ union_type_ = checked_cast<const SparseUnionType*>(this->type_.get());
+ children_.resize(this->type_->num_fields());
- if (union_type_->mode() == UnionMode::SPARSE) {
- sparse_children_.resize(this->type_->num_fields());
- } else {
- dense_children_.resize(this->type_->num_fields());
- child_length_.resize(union_type_->max_type_code() + 1);
+ for (int i = 0; i < this->type_->num_fields(); ++i) {
+ RETURN_NOT_OK(
+ Taker<IndexSequence>::Make(this->type_->field(i)->type(), &children_[i]));
}
+ return Status::OK();
+ }
+
+ Status SetContext(KernelContext* ctx) override {
+ pool_ = ctx->memory_pool();
+ null_bitmap_builder_.reset(new TypedBufferBuilder<bool>(pool_));
+ type_code_builder_.reset(new TypedBufferBuilder<int8_t>(pool_));
for (int i = 0; i < this->type_->num_fields(); ++i) {
- if (union_type_->mode() == UnionMode::SPARSE) {
- RETURN_NOT_OK(Taker<IndexSequence>::Make(this->type_->field(i)->type(),
- &sparse_children_[i]));
- } else {
- RETURN_NOT_OK(Taker<ArrayIndexSequence<Int32Type>>::Make(
- this->type_->field(i)->type(), &dense_children_[i]));
- }
+ RETURN_NOT_OK(children_[i]->SetContext(ctx));
+ }
+ return Status::OK();
+ }
+
+ Status Take(const Array& values, IndexSequence indices) override {
+ DCHECK(this->type_->Equals(values.type()));
+ const auto& union_array = checked_cast<const UnionArray&>(values);
+ auto type_codes = union_array.raw_type_codes();
+
+ RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length()));
+ RETURN_NOT_OK(type_code_builder_->Reserve(indices.length()));
+ RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) {
+ null_bitmap_builder_->UnsafeAppend(is_valid);
+ type_code_builder_->UnsafeAppend(type_codes[index]);
+ return Status::OK();
+ }));
+
+ // bounds checking was done while appending to the null bitmap
+ indices.set_never_out_of_bounds();
+
+ for (int i = 0; i < this->type_->num_fields(); ++i) {
+ RETURN_NOT_OK(children_[i]->Take(*union_array.field(i), indices));
+ }
+ return Status::OK();
+ }
+
+ Status Finish(std::shared_ptr<Array>* out) override {
+ auto null_count = null_bitmap_builder_->false_count();
+ auto length = null_bitmap_builder_->length();
+ std::shared_ptr<Buffer> null_bitmap, type_codes;
+ RETURN_NOT_OK(null_bitmap_builder_->Finish(&null_bitmap));
+ RETURN_NOT_OK(type_code_builder_->Finish(&type_codes));
+
+ ArrayVector fields(this->type_->num_fields());
+ for (int i = 0; i < this->type_->num_fields(); ++i) {
+ RETURN_NOT_OK(children_[i]->Finish(&fields[i]));
+ }
+
+ out->reset(new SparseUnionArray(this->type_, length, std::move(fields), type_codes,
+ null_bitmap, null_count));
+ return Status::OK();
+ }
+
+ protected:
+ int32_t* GetInt32(const std::shared_ptr<Buffer>& b) const {
+ return reinterpret_cast<int32_t*>(b->mutable_data());
+ }
+
+ const SparseUnionType* union_type_ = nullptr;
+ MemoryPool* pool_ = nullptr;
+ std::unique_ptr<TypedBufferBuilder<bool>> null_bitmap_builder_;
+ std::unique_ptr<TypedBufferBuilder<int8_t>> type_code_builder_;
+ std::unique_ptr<TypedBufferBuilder<int32_t>> offset_builder_;
+ std::vector<std::unique_ptr<Taker<IndexSequence>>> children_;
+ std::vector<int32_t> child_length_;
+};
+
+template <typename IndexSequence>
+class TakerImpl<IndexSequence, DenseUnionType> : public Taker<IndexSequence> {
+ public:
+ using Taker<IndexSequence>::Taker;
+
+ Status Init() override {
+ union_type_ = checked_cast<const DenseUnionType*>(this->type_.get());
+
+ dense_children_.resize(this->type_->num_fields());
+ child_length_.resize(union_type_->max_type_code() + 1);
+
+ for (int i = 0; i < this->type_->num_fields(); ++i) {
+ RETURN_NOT_OK(Taker<ArrayIndexSequence<Int32Type>>::Make(
+ this->type_->field(i)->type(), &dense_children_[i]));
}
return Status::OK();
@@ -525,18 +596,12 @@ class TakerImpl<IndexSequence, UnionType> : public Taker<IndexSequence> {
pool_ = ctx->memory_pool();
null_bitmap_builder_.reset(new TypedBufferBuilder<bool>(pool_));
type_code_builder_.reset(new TypedBufferBuilder<int8_t>(pool_));
+ offset_builder_.reset(new TypedBufferBuilder<int32_t>(pool_));
- if (union_type_->mode() == UnionMode::DENSE) {
- offset_builder_.reset(new TypedBufferBuilder<int32_t>(pool_));
- std::fill(child_length_.begin(), child_length_.end(), 0);
- }
+ std::fill(child_length_.begin(), child_length_.end(), 0);
for (int i = 0; i < this->type_->num_fields(); ++i) {
- if (union_type_->mode() == UnionMode::SPARSE) {
- RETURN_NOT_OK(sparse_children_[i]->SetContext(ctx));
- } else {
- RETURN_NOT_OK(dense_children_[i]->SetContext(ctx));
- }
+ RETURN_NOT_OK(dense_children_[i]->SetContext(ctx));
}
return Status::OK();
@@ -547,81 +612,65 @@ class TakerImpl<IndexSequence, UnionType> : public Taker<IndexSequence> {
const auto& union_array = checked_cast<const UnionArray&>(values);
auto type_codes = union_array.raw_type_codes();
- if (union_type_->mode() == UnionMode::SPARSE) {
- RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length()));
- RETURN_NOT_OK(type_code_builder_->Reserve(indices.length()));
- RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) {
- null_bitmap_builder_->UnsafeAppend(is_valid);
- type_code_builder_->UnsafeAppend(type_codes[index]);
- return Status::OK();
- }));
+ // Gathering from the offsets into child arrays is a bit tricky.
+ std::vector<uint32_t> child_counts(union_type_->max_type_code() + 1);
+ RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length()));
+ RETURN_NOT_OK(type_code_builder_->Reserve(indices.length()));
+ RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) {
+ null_bitmap_builder_->UnsafeAppend(is_valid);
+ type_code_builder_->UnsafeAppend(type_codes[index]);
+ child_counts[type_codes[index]] += is_valid;
+ return Status::OK();
+ }));
- // bounds checking was done while appending to the null bitmap
- indices.set_never_out_of_bounds();
+ // bounds checking was done while appending to the null bitmap
+ indices.set_never_out_of_bounds();
- for (int i = 0; i < this->type_->num_fields(); ++i) {
- RETURN_NOT_OK(sparse_children_[i]->Take(*union_array.field(i), indices));
- }
- } else {
- // Gathering from the offsets into child arrays is a bit tricky.
- std::vector<uint32_t> child_counts(union_type_->max_type_code() + 1);
- RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length()));
- RETURN_NOT_OK(type_code_builder_->Reserve(indices.length()));
- RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) {
- null_bitmap_builder_->UnsafeAppend(is_valid);
- type_code_builder_->UnsafeAppend(type_codes[index]);
- child_counts[type_codes[index]] += is_valid;
- return Status::OK();
- }));
-
- // bounds checking was done while appending to the null bitmap
- indices.set_never_out_of_bounds();
-
- // Allocate temporary storage for the offsets of all valid slots
- auto child_offsets_storage_size =
- std::accumulate(child_counts.begin(), child_counts.end(), 0);
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<Buffer> child_offsets_storage,
- AllocateBuffer(child_offsets_storage_size * sizeof(int32_t), pool_));
-
- // Partition offsets by type_code: child_offset_partitions[type_code] will
- // point to storage for child_counts[type_code] offsets
- std::vector<int32_t*> child_offset_partitions(child_counts.size());
- auto child_offsets_storage_data = GetInt32(child_offsets_storage);
- for (auto type_code : union_type_->type_codes()) {
- child_offset_partitions[type_code] = child_offsets_storage_data;
- child_offsets_storage_data += child_counts[type_code];
- }
- DCHECK_EQ(child_offsets_storage_data - GetInt32(child_offsets_storage),
- child_offsets_storage_size);
-
- // Fill child_offsets_storage with the taken offsets
- RETURN_NOT_OK(offset_builder_->Reserve(indices.length()));
- RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) {
- auto type_code = type_codes[index];
- if (is_valid) {
- offset_builder_->UnsafeAppend(child_length_[type_code]++);
- *child_offset_partitions[type_code] = union_array.value_offset(index);
- ++child_offset_partitions[type_code];
- } else {
- offset_builder_->UnsafeAppend(0);
- }
- return Status::OK();
- }));
-
- // Take from each child at those offsets
- int64_t taken_offset_begin = 0;
- for (int i = 0; i < this->type_->num_fields(); ++i) {
- auto type_code = union_type_->type_codes()[i];
- auto length = child_counts[type_code];
- Int32Array taken_offsets(length, SliceBuffer(child_offsets_storage,
- sizeof(int32_t) * taken_offset_begin,
- sizeof(int32_t) * length));
- ArrayIndexSequence<Int32Type> child_indices(taken_offsets);
- child_indices.set_never_out_of_bounds();
- RETURN_NOT_OK(dense_children_[i]->Take(*union_array.field(i), child_indices));
- taken_offset_begin += length;
+ // Allocate temporary storage for the offsets of all valid slots
+ auto child_offsets_storage_size =
+ std::accumulate(child_counts.begin(), child_counts.end(), 0);
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> child_offsets_storage,
+ AllocateBuffer(child_offsets_storage_size * sizeof(int32_t), pool_));
+
+ // Partition offsets by type_code: child_offset_partitions[type_code] will
+ // point to storage for child_counts[type_code] offsets
+ std::vector<int32_t*> child_offset_partitions(child_counts.size());
+ auto child_offsets_storage_data = GetInt32(child_offsets_storage);
+ for (auto type_code : union_type_->type_codes()) {
+ child_offset_partitions[type_code] = child_offsets_storage_data;
+ child_offsets_storage_data += child_counts[type_code];
+ }
+ DCHECK_EQ(child_offsets_storage_data - GetInt32(child_offsets_storage),
+ child_offsets_storage_size);
+
+ // Fill child_offsets_storage with the taken offsets
+ RETURN_NOT_OK(offset_builder_->Reserve(indices.length()));
+ RETURN_NOT_OK(VisitIndices(indices, values, [&](int64_t index, bool is_valid) {
+ auto type_code = type_codes[index];
+ if (is_valid) {
+ offset_builder_->UnsafeAppend(child_length_[type_code]++);
+ *child_offset_partitions[type_code] =
+ checked_cast<const DenseUnionArray&>(union_array).value_offset(index);
+ ++child_offset_partitions[type_code];
+ } else {
+ offset_builder_->UnsafeAppend(0);
}
+ return Status::OK();
+ }));
+
+ // Take from each child at those offsets
+ int64_t taken_offset_begin = 0;
+ for (int i = 0; i < this->type_->num_fields(); ++i) {
+ auto type_code = union_type_->type_codes()[i];
+ auto length = child_counts[type_code];
+ Int32Array taken_offsets(
+ length, SliceBuffer(child_offsets_storage, sizeof(int32_t) * taken_offset_begin,
+ sizeof(int32_t) * length));
+ ArrayIndexSequence<Int32Type> child_indices(taken_offsets);
+ child_indices.set_never_out_of_bounds();
+ RETURN_NOT_OK(dense_children_[i]->Take(*union_array.field(i), child_indices));
+ taken_offset_begin += length;
}
return Status::OK();
@@ -630,26 +679,18 @@ class TakerImpl<IndexSequence, UnionType> : public Taker<IndexSequence> {
Status Finish(std::shared_ptr<Array>* out) override {
auto null_count = null_bitmap_builder_->false_count();
auto length = null_bitmap_builder_->length();
- std::shared_ptr<Buffer> null_bitmap, type_codes;
+ std::shared_ptr<Buffer> null_bitmap, type_codes, offsets;
RETURN_NOT_OK(null_bitmap_builder_->Finish(&null_bitmap));
RETURN_NOT_OK(type_code_builder_->Finish(&type_codes));
-
- std::shared_ptr<Buffer> offsets;
- if (union_type_->mode() == UnionMode::DENSE) {
- RETURN_NOT_OK(offset_builder_->Finish(&offsets));
- }
+ RETURN_NOT_OK(offset_builder_->Finish(&offsets));
ArrayVector fields(this->type_->num_fields());
for (int i = 0; i < this->type_->num_fields(); ++i) {
- if (union_type_->mode() == UnionMode::SPARSE) {
- RETURN_NOT_OK(sparse_children_[i]->Finish(&fields[i]));
- } else {
- RETURN_NOT_OK(dense_children_[i]->Finish(&fields[i]));
- }
+ RETURN_NOT_OK(dense_children_[i]->Finish(&fields[i]));
}
- out->reset(new UnionArray(this->type_, length, std::move(fields), type_codes, offsets,
- null_bitmap, null_count));
+ out->reset(new DenseUnionArray(this->type_, length, std::move(fields), type_codes,
+ offsets, null_bitmap, null_count));
return Status::OK();
}
@@ -658,7 +699,7 @@ class TakerImpl<IndexSequence, UnionType> : public Taker<IndexSequence> {
return reinterpret_cast<int32_t*>(b->mutable_data());
}
- const UnionType* union_type_ = nullptr;
+ const DenseUnionType* union_type_ = nullptr;
MemoryPool* pool_ = nullptr;
std::unique_ptr<TypedBufferBuilder<bool>> null_bitmap_builder_;
std::unique_ptr<TypedBufferBuilder<int8_t>> type_code_builder_;
diff --git a/cpp/src/arrow/compute/kernels/vector_take_test.cc b/cpp/src/arrow/compute/kernels/vector_take_test.cc
index 51d183c..a7568df 100644
--- a/cpp/src/arrow/compute/kernels/vector_take_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_take_test.cc
@@ -514,10 +514,9 @@ TEST_F(TestTakeKernelWithStruct, TakeStruct) {
class TestTakeKernelWithUnion : public TestTakeKernel<UnionType> {};
// TODO: Restore Union take functionality
-
TEST_F(TestTakeKernelWithUnion, DISABLED_TakeUnion) {
- for (auto mode : {UnionMode::SPARSE, UnionMode::DENSE}) {
- auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5}, mode);
+ for (auto union_ : UnionTypeFactories()) {
+ auto union_type = union_({field("a", int32()), field("b", utf8())}, {2, 5});
auto union_json = R"([
null,
[2, 222],
diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc
index f658c33..9d66002 100644
--- a/cpp/src/arrow/ipc/json_internal.cc
+++ b/cpp/src/arrow/ipc/json_internal.cc
@@ -668,7 +668,8 @@ class ArrayWriter {
WriteIntegerField("TYPE_ID", array.raw_type_codes(), array.length());
if (type.mode() == UnionMode::DENSE) {
- WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length());
+ auto offsets = checked_cast<const DenseUnionArray&>(array).raw_value_offsets();
+ WriteIntegerField("OFFSET", offsets, array.length());
}
std::vector<std::shared_ptr<Array>> children;
children.reserve(array.num_fields());
@@ -945,7 +946,11 @@ static Status GetUnion(const RjObject& json_type,
type_codes.push_back(static_cast<int8_t>(val.GetInt()));
}
- *type = union_(children, type_codes, mode);
+ if (mode == UnionMode::SPARSE) {
+ *type = sparse_union(std::move(children), std::move(type_codes));
+ } else {
+ *type = dense_union(std::move(children), std::move(type_codes));
+ }
return Status::OK();
}
@@ -1471,7 +1476,6 @@ class ArrayReader {
std::shared_ptr<Buffer> validity_buffer;
std::shared_ptr<Buffer> type_id_buffer;
- std::shared_ptr<Buffer> offsets_buffer;
RETURN_NOT_OK(GetValidityBuffer(is_valid_, &null_count, &validity_buffer));
@@ -1480,18 +1484,24 @@ class ArrayReader {
RETURN_NOT_OK(
GetIntArray<uint8_t>(json_type_ids->value.GetArray(), length_, &type_id_buffer));
- if (type.mode() == UnionMode::DENSE) {
+ std::vector<std::shared_ptr<Array>> children;
+ RETURN_NOT_OK(GetChildren(obj_, type, &children));
+
+ if (type.mode() == UnionMode::SPARSE) {
+ result_ = std::make_shared<SparseUnionArray>(
+ type_, length_, children, type_id_buffer, validity_buffer, null_count);
+ } else {
const auto& json_offsets = obj_.FindMember("OFFSET");
RETURN_NOT_ARRAY("OFFSET", json_offsets, obj_);
+
+ std::shared_ptr<Buffer> offsets_buffer;
RETURN_NOT_OK(
GetIntArray<int32_t>(json_offsets->value.GetArray(), length_, &offsets_buffer));
- }
- std::vector<std::shared_ptr<Array>> children;
- RETURN_NOT_OK(GetChildren(obj_, type, &children));
-
- result_ = std::make_shared<UnionArray>(type_, length_, children, type_id_buffer,
- offsets_buffer, validity_buffer, null_count);
+ result_ =
+ std::make_shared<DenseUnionArray>(type_, length_, children, type_id_buffer,
+ offsets_buffer, validity_buffer, null_count);
+ }
return Status::OK();
}
diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc
index e189fd1..1ca69bf 100644
--- a/cpp/src/arrow/ipc/json_simple.cc
+++ b/cpp/src/arrow/ipc/json_simple.cc
@@ -805,7 +805,8 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter<LargeBinaryType>)
SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
- SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter)
+ SIMPLE_CONVERTER_CASE(Type::SPARSE_UNION, UnionConverter)
+ SIMPLE_CONVERTER_CASE(Type::DENSE_UNION, UnionConverter)
SIMPLE_CONVERTER_CASE(Type::INTERVAL_MONTHS, IntegerConverter<MonthIntervalType>)
SIMPLE_CONVERTER_CASE(Type::INTERVAL_DAY_TIME, DayTimeIntervalConverter)
default:
diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc
index 5b21fd5..bf6098d 100644
--- a/cpp/src/arrow/ipc/json_simple_test.cc
+++ b/cpp/src/arrow/ipc/json_simple_test.cc
@@ -994,8 +994,8 @@ TEST(TestDenseUnion, Basics) {
auto field_a = field("a", int8());
auto field_b = field("b", boolean());
- auto type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
- auto array = checked_pointer_cast<UnionArray>(
+ auto type = dense_union({field_a, field_b}, {4, 8});
+ auto array = checked_pointer_cast<DenseUnionArray>(
ArrayFromJSON(type, "[null, [4, 122], [8, true], [4, null], null, [8, false]]"));
auto expected_types = ArrayFromJSON(int8(), "[null, 4, 8, 4, null, 8]");
@@ -1004,7 +1004,7 @@ TEST(TestDenseUnion, Basics) {
auto expected_b = ArrayFromJSON(boolean(), "[true, false]");
ASSERT_OK_AND_ASSIGN(
- auto expected, UnionArray::MakeDense(*expected_types, *expected_offsets,
+ auto expected, DenseUnionArray::Make(*expected_types, *expected_offsets,
{expected_a, expected_b}, {"a", "b"}, {4, 8}));
ASSERT_ARRAYS_EQUAL(*expected, *array);
@@ -1019,7 +1019,7 @@ TEST(TestSparseUnion, Basics) {
auto field_a = field("a", int8());
auto field_b = field("b", boolean());
- auto type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+ auto type = sparse_union({field_a, field_b}, {4, 8});
auto array = ArrayFromJSON(type, "[[4, 122], [8, true], [4, null], null, [8, false]]");
auto expected_types = ArrayFromJSON(int8(), "[4, 8, 4, null, 8]");
@@ -1027,7 +1027,7 @@ TEST(TestSparseUnion, Basics) {
auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]");
ASSERT_OK_AND_ASSIGN(auto expected,
- UnionArray::MakeSparse(*expected_types, {expected_a, expected_b},
+ SparseUnionArray::Make(*expected_types, {expected_a, expected_b},
{"a", "b"}, {4, 8}));
ASSERT_ARRAYS_EQUAL(*expected, *array);
@@ -1036,7 +1036,7 @@ TEST(TestSparseUnion, Basics) {
TEST(TestDenseUnion, ListOfUnion) {
auto field_a = field("a", int8());
auto field_b = field("b", boolean());
- auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+ auto union_type = dense_union({field_a, field_b}, {4, 8});
auto list_type = list(union_type);
auto array =
checked_pointer_cast<ListArray>(ArrayFromJSON(list_type,
@@ -1052,7 +1052,7 @@ TEST(TestDenseUnion, ListOfUnion) {
ASSERT_OK_AND_ASSIGN(
auto expected_values,
- UnionArray::MakeDense(*expected_types, *expected_offsets, {expected_a, expected_b},
+ DenseUnionArray::Make(*expected_types, *expected_offsets, {expected_a, expected_b},
{"a", "b"}, {4, 8}));
auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]");
ASSERT_OK_AND_ASSIGN(auto expected,
@@ -1061,9 +1061,9 @@ TEST(TestDenseUnion, ListOfUnion) {
ASSERT_ARRAYS_EQUAL(*expected, *array);
// ensure that the array is as dense as we expect
- auto array_values = checked_pointer_cast<UnionArray>(array->values());
+ auto array_values = checked_pointer_cast<DenseUnionArray>(array->values());
ASSERT_TRUE(array_values->value_offsets()->Equals(
- *checked_pointer_cast<UnionArray>(expected_values)->value_offsets()));
+ *checked_pointer_cast<DenseUnionArray>(expected_values)->value_offsets()));
ASSERT_ARRAYS_EQUAL(*expected_a, *array_values->field(0));
ASSERT_ARRAYS_EQUAL(*expected_b, *array_values->field(1));
}
@@ -1071,7 +1071,7 @@ TEST(TestDenseUnion, ListOfUnion) {
TEST(TestSparseUnion, ListOfUnion) {
auto field_a = field("a", int8());
auto field_b = field("b", boolean());
- auto union_type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+ auto union_type = sparse_union({field_a, field_b}, {4, 8});
auto list_type = list(union_type);
auto array = ArrayFromJSON(list_type,
"["
@@ -1084,7 +1084,7 @@ TEST(TestSparseUnion, ListOfUnion) {
auto expected_b = ArrayFromJSON(boolean(), "[null, true, null, null, false]");
ASSERT_OK_AND_ASSIGN(auto expected_values,
- UnionArray::MakeSparse(*expected_types, {expected_a, expected_b},
+ SparseUnionArray::Make(*expected_types, {expected_a, expected_b},
{"a", "b"}, {4, 8}));
auto expected_list_offsets = ArrayFromJSON(int32(), "[0, 2, 5]");
ASSERT_OK_AND_ASSIGN(auto expected,
@@ -1099,8 +1099,8 @@ TEST(TestDenseUnion, UnionOfStructs) {
field("wtf", struct_({field("whiskey", int8()), field("tango", float64()),
field("foxtrot", list(int8()))})),
field("q", struct_({field("quebec", utf8())}))};
- auto type = union_(fields, {0, 23, 47}, UnionMode::DENSE);
- auto array = checked_pointer_cast<UnionArray>(ArrayFromJSON(type, R"([
+ auto type = dense_union(fields, {0, 23, 47});
+ auto array = checked_pointer_cast<DenseUnionArray>(ArrayFromJSON(type, R"([
[0, {"alpha": 0.0, "bravo": "charlie"}],
[23, {"whiskey": 99}],
[0, {"bravo": "mike"}],
@@ -1122,7 +1122,7 @@ TEST(TestDenseUnion, UnionOfStructs) {
ASSERT_OK_AND_ASSIGN(
auto expected,
- UnionArray::MakeDense(*expected_types, *expected_offsets, expected_fields,
+ DenseUnionArray::Make(*expected_types, *expected_offsets, expected_fields,
{"ab", "wtf", "q"}, {0, 23, 47}));
ASSERT_ARRAYS_EQUAL(*expected, *array);
@@ -1141,7 +1141,7 @@ TEST(TestSparseUnion, UnionOfStructs) {
field("wtf", struct_({field("whiskey", int8()), field("tango", float64()),
field("foxtrot", list(int8()))})),
field("q", struct_({field("quebec", utf8())}))};
- auto type = union_(fields, {0, 23, 47}, UnionMode::SPARSE);
+ auto type = sparse_union(fields, {0, 23, 47});
auto array = ArrayFromJSON(type, R"([
[0, {"alpha": 0.0, "bravo": "charlie"}],
[23, {"whiskey": 99}],
@@ -1169,7 +1169,7 @@ TEST(TestSparseUnion, UnionOfStructs) {
ArrayFromJSON(fields[2]->type(), "[null, null, null, null, null]")};
ASSERT_OK_AND_ASSIGN(auto expected,
- UnionArray::MakeSparse(*expected_types, expected_fields,
+ SparseUnionArray::Make(*expected_types, expected_fields,
{"ab", "wtf", "q"}, {0, 23, 47}));
ASSERT_ARRAYS_EQUAL(*expected, *array);
@@ -1178,7 +1178,7 @@ TEST(TestSparseUnion, UnionOfStructs) {
TEST(TestDenseUnion, Errors) {
auto field_a = field("a", int8());
auto field_b = field("b", boolean());
- std::shared_ptr<DataType> type = union_({field_a, field_b}, {4, 8}, UnionMode::DENSE);
+ std::shared_ptr<DataType> type = dense_union({field_a, field_b}, {4, 8});
std::shared_ptr<Array> array;
ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]", &array));
@@ -1195,7 +1195,7 @@ TEST(TestDenseUnion, Errors) {
TEST(TestSparseUnion, Errors) {
auto field_a = field("a", int8());
auto field_b = field("b", boolean());
- std::shared_ptr<DataType> type = union_({field_a, field_b}, {4, 8}, UnionMode::SPARSE);
+ std::shared_ptr<DataType> type = sparse_union({field_a, field_b}, {4, 8});
std::shared_ptr<Array> array;
ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"not a valid type_id\"]", &array));
diff --git a/cpp/src/arrow/ipc/json_test.cc b/cpp/src/arrow/ipc/json_test.cc
index 944d352..e55a448 100644
--- a/cpp/src/arrow/ipc/json_test.cc
+++ b/cpp/src/arrow/ipc/json_test.cc
@@ -399,8 +399,9 @@ TEST(TestJsonSchemaWriter, FlatTypes) {
field("f15", date64()),
field("f16", timestamp(TimeUnit::NANO)),
field("f17", time64(TimeUnit::MICRO)),
- field("f18", union_({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))},
- {0, 1}, UnionMode::DENSE)),
+ field("f18",
+ dense_union({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))},
+ {0, 1})),
field("f19", large_list(uint8())),
field("f20", null()),
};
diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc
index 4ac20db..e1b14e2 100644
--- a/cpp/src/arrow/ipc/metadata_internal.cc
+++ b/cpp/src/arrow/ipc/metadata_internal.cc
@@ -172,7 +172,14 @@ Status UnionFromFlatbuffer(const flatbuf::Union* union_data,
}
}
- return UnionType::Make(children, type_codes, mode).Value(out);
+ if (mode == UnionMode::SPARSE) {
+ ARROW_ASSIGN_OR_RAISE(
+ *out, SparseUnionType::Make(std::move(children), std::move(type_codes)));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ *out, DenseUnionType::Make(std::move(children), std::move(type_codes)));
+ }
+ return Status::OK();
}
#define INT_TO_FB_CASE(BIT_WIDTH, IS_SIGNED) \
diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc
index 9393c44..9236708 100644
--- a/cpp/src/arrow/ipc/read_write_test.cc
+++ b/cpp/src/arrow/ipc/read_write_test.cc
@@ -629,24 +629,24 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) {
CheckArray(a1);
// Sparse Union
- auto union_type = union_({field("f0", a0->type())}, {0});
+ auto union_type = sparse_union({field("f0", a0->type())}, {0});
std::vector<int32_t> type_ids(a0->length());
std::shared_ptr<Buffer> ids_buffer;
ASSERT_OK(CopyBufferFromVector(type_ids, default_memory_pool(), &ids_buffer));
- a1 =
- std::make_shared<UnionArray>(union_type, a0->length(), struct_children, ids_buffer);
+ a1 = std::make_shared<SparseUnionArray>(union_type, a0->length(), struct_children,
+ ids_buffer);
CheckArray(a1);
// Dense union
- auto dense_union_type = union_({field("f0", a0->type())}, {0}, UnionMode::DENSE);
+ auto dense_union_type = dense_union({field("f0", a0->type())}, {0});
std::vector<int32_t> type_offsets;
for (int32_t i = 0; i < a0->length(); ++i) {
type_offsets.push_back(i);
}
std::shared_ptr<Buffer> offsets_buffer;
ASSERT_OK(CopyBufferFromVector(type_offsets, default_memory_pool(), &offsets_buffer));
- a1 = std::make_shared<UnionArray>(dense_union_type, a0->length(), struct_children,
- ids_buffer, offsets_buffer);
+ a1 = std::make_shared<DenseUnionArray>(dense_union_type, a0->length(), struct_children,
+ ids_buffer, offsets_buffer);
CheckArray(a1);
}
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index dbd476e..e20ee06 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -303,7 +303,8 @@ class ArrayLoader {
}
Status Visit(const UnionType& type) {
- out_->buffers.resize(3);
+ int n_buffers = type.mode() == UnionMode::SPARSE ? 2 : 3;
+ out_->buffers.resize(n_buffers);
RETURN_NOT_OK(LoadCommon());
if (out_->length > 0) {
@@ -312,7 +313,7 @@ class ArrayLoader {
RETURN_NOT_OK(GetBuffer(buffer_index_ + 1, &out_->buffers[2]));
}
}
- buffer_index_ += type.mode() == UnionMode::DENSE ? 2 : 1;
+ buffer_index_ += n_buffers - 1;
return LoadChildren(type.fields());
}
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index 00c3942..608c2f1 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -435,15 +435,12 @@ Status MakeStruct(std::shared_ptr<RecordBatch>* out) {
Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
// Define schema
- std::vector<std::shared_ptr<Field>> union_types(
+ std::vector<std::shared_ptr<Field>> union_fields(
{field("u0", int32()), field("u1", uint8())});
std::vector<int8_t> type_codes = {5, 10};
- auto sparse_type =
- std::make_shared<UnionType>(union_types, type_codes, UnionMode::SPARSE);
-
- auto dense_type =
- std::make_shared<UnionType>(union_types, type_codes, UnionMode::DENSE);
+ auto sparse_type = sparse_union(union_fields, type_codes);
+ auto dense_type = dense_union(union_fields, type_codes);
auto f0 = field("sparse_nonnull", sparse_type, false);
auto f1 = field("sparse", sparse_type);
@@ -483,14 +480,14 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) {
ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes));
// construct individual nullable/non-nullable struct arrays
- auto sparse_no_nulls =
- std::make_shared<UnionArray>(sparse_type, length, sparse_children, type_ids_buffer);
- auto sparse = std::make_shared<UnionArray>(sparse_type, length, sparse_children,
- type_ids_buffer, NULLPTR, null_bitmap, 1);
+ auto sparse_no_nulls = std::make_shared<SparseUnionArray>(
+ sparse_type, length, sparse_children, type_ids_buffer);
+ auto sparse = std::make_shared<SparseUnionArray>(sparse_type, length, sparse_children,
+ type_ids_buffer, null_bitmap, 1);
auto dense =
- std::make_shared<UnionArray>(dense_type, length, dense_children, type_ids_buffer,
- offsets_buffer, null_bitmap, 1);
+ std::make_shared<DenseUnionArray>(dense_type, length, dense_children,
+ type_ids_buffer, offsets_buffer, null_bitmap, 1);
// construct batch
std::vector<std::shared_ptr<Array>> arrays = {sparse_no_nulls, sparse, dense};
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 76c5ad4..d0d7deb 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -398,7 +398,7 @@ class RecordBatchSerializer {
return Status::OK();
}
- Status Visit(const UnionArray& array) {
+ Status Visit(const SparseUnionArray& array) {
const int64_t offset = array.offset();
const int64_t length = array.length();
@@ -409,91 +409,103 @@ class RecordBatchSerializer {
out_->body_buffers.emplace_back(type_codes);
--max_recursion_depth_;
- if (array.mode() == UnionMode::DENSE) {
- const auto& type = checked_cast<const UnionType&>(*array.type());
-
- std::shared_ptr<Buffer> value_offsets;
- RETURN_NOT_OK(GetTruncatedBuffer(
- offset, length, static_cast<int32_t>(sizeof(int32_t)), array.value_offsets(),
- options_.memory_pool, &value_offsets));
-
- // The Union type codes are not necessary 0-indexed
- int8_t max_code = 0;
- for (int8_t code : type.type_codes()) {
- if (code > max_code) {
- max_code = code;
- }
- }
+ for (int i = 0; i < array.num_fields(); ++i) {
+ // Sparse union, slicing is done for us by field()
+ RETURN_NOT_OK(VisitArray(*array.field(i)));
+ }
+ ++max_recursion_depth_;
+ return Status::OK();
+ }
- // Allocate an array of child offsets. Set all to -1 to indicate that we
- // haven't observed a first occurrence of a particular child yet
- std::vector<int32_t> child_offsets(max_code + 1, -1);
- std::vector<int32_t> child_lengths(max_code + 1, 0);
+ Status Visit(const DenseUnionArray& array) {
+ const int64_t offset = array.offset();
+ const int64_t length = array.length();
- if (offset != 0) {
- // This is an unpleasant case. Because the offsets are different for
- // each child array, when we have a sliced array, we need to "rebase"
- // the value_offsets for each array
-
- const int32_t* unshifted_offsets = array.raw_value_offsets();
- const int8_t* type_codes = array.raw_type_codes();
-
- // Allocate the shifted offsets
- ARROW_ASSIGN_OR_RAISE(
- auto shifted_offsets_buffer,
- AllocateBuffer(length * sizeof(int32_t), options_.memory_pool));
- int32_t* shifted_offsets =
- reinterpret_cast<int32_t*>(shifted_offsets_buffer->mutable_data());
-
- // Offsets may not be ascending, so we need to find out the start offset
- // for each child
- for (int64_t i = 0; i < length; ++i) {
- const uint8_t code = type_codes[i];
- if (child_offsets[code] == -1) {
- child_offsets[code] = unshifted_offsets[i];
- } else {
- child_offsets[code] = std::min(child_offsets[code], unshifted_offsets[i]);
- }
- }
+ std::shared_ptr<Buffer> type_codes;
+ RETURN_NOT_OK(GetTruncatedBuffer(
+ offset, length, static_cast<int32_t>(sizeof(UnionArray::type_code_t)),
+ array.type_codes(), options_.memory_pool, &type_codes));
+ out_->body_buffers.emplace_back(type_codes);
- // Now compute shifted offsets by subtracting child offset
- for (int64_t i = 0; i < length; ++i) {
- const int8_t code = type_codes[i];
- shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code];
- // Update the child length to account for observed value
- child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] + 1);
- }
+ --max_recursion_depth_;
+ const auto& type = checked_cast<const UnionType&>(*array.type());
- value_offsets = std::move(shifted_offsets_buffer);
+ std::shared_ptr<Buffer> value_offsets;
+ RETURN_NOT_OK(
+ GetTruncatedBuffer(offset, length, static_cast<int32_t>(sizeof(int32_t)),
+ array.value_offsets(), options_.memory_pool, &value_offsets));
+
+ // The Union type codes are not necessary 0-indexed
+ int8_t max_code = 0;
+ for (int8_t code : type.type_codes()) {
+ if (code > max_code) {
+ max_code = code;
}
- out_->body_buffers.emplace_back(value_offsets);
-
- // Visit children and slice accordingly
- for (int i = 0; i < type.num_fields(); ++i) {
- std::shared_ptr<Array> child = array.field(i);
-
- // TODO: ARROW-809, for sliced unions, tricky to know how much to
- // truncate the children. For now, we are truncating the children to be
- // no longer than the parent union.
- if (offset != 0) {
- const int8_t code = type.type_codes()[i];
- const int64_t child_offset = child_offsets[code];
- const int64_t child_length = child_lengths[code];
-
- if (child_offset > 0) {
- child = child->Slice(child_offset, child_length);
- } else if (child_length < child->length()) {
- // This case includes when child is not encountered at all
- child = child->Slice(0, child_length);
- }
+ }
+
+ // Allocate an array of child offsets. Set all to -1 to indicate that we
+ // haven't observed a first occurrence of a particular child yet
+ std::vector<int32_t> child_offsets(max_code + 1, -1);
+ std::vector<int32_t> child_lengths(max_code + 1, 0);
+
+ if (offset != 0) {
+ // This is an unpleasant case. Because the offsets are different for
+ // each child array, when we have a sliced array, we need to "rebase"
+ // the value_offsets for each array
+
+ const int32_t* unshifted_offsets = array.raw_value_offsets();
+ const int8_t* type_codes = array.raw_type_codes();
+
+ // Allocate the shifted offsets
+ ARROW_ASSIGN_OR_RAISE(
+ auto shifted_offsets_buffer,
+ AllocateBuffer(length * sizeof(int32_t), options_.memory_pool));
+ int32_t* shifted_offsets =
+ reinterpret_cast<int32_t*>(shifted_offsets_buffer->mutable_data());
+
+ // Offsets may not be ascending, so we need to find out the start offset
+ // for each child
+ for (int64_t i = 0; i < length; ++i) {
+ const uint8_t code = type_codes[i];
+ if (child_offsets[code] == -1) {
+ child_offsets[code] = unshifted_offsets[i];
+ } else {
+ child_offsets[code] = std::min(child_offsets[code], unshifted_offsets[i]);
}
- RETURN_NOT_OK(VisitArray(*child));
}
- } else {
- for (int i = 0; i < array.num_fields(); ++i) {
- // Sparse union, slicing is done for us by field()
- RETURN_NOT_OK(VisitArray(*array.field(i)));
+
+ // Now compute shifted offsets by subtracting child offset
+ for (int64_t i = 0; i < length; ++i) {
+ const int8_t code = type_codes[i];
+ shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code];
+ // Update the child length to account for observed value
+ child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] + 1);
+ }
+
+ value_offsets = std::move(shifted_offsets_buffer);
+ }
+ out_->body_buffers.emplace_back(value_offsets);
+
+ // Visit children and slice accordingly
+ for (int i = 0; i < type.num_fields(); ++i) {
+ std::shared_ptr<Array> child = array.field(i);
+
+ // TODO: ARROW-809, for sliced unions, tricky to know how much to
+ // truncate the children. For now, we are truncating the children to be
+ // no longer than the parent union.
+ if (offset != 0) {
+ const int8_t code = type.type_codes()[i];
+ const int64_t child_offset = child_offsets[code];
+ const int64_t child_length = child_lengths[code];
+
+ if (child_offset > 0) {
+ child = child->Slice(child_offset, child_length);
+ } else if (child_length < child->length()) {
+ // This case includes when child is not encountered at all
+ child = child->Slice(0, child_length);
+ }
}
+ RETURN_NOT_OK(VisitArray(*child));
}
++max_recursion_depth_;
return Status::OK();
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index d47f2d7..70b6fd0 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -353,8 +353,9 @@ class ArrayPrinter : public PrettyPrinter {
if (array.mode() == UnionMode::DENSE) {
Newline();
Write("-- value_offsets: ");
- Int32Array value_offsets(array.length(), array.value_offsets(), nullptr, 0,
- array.offset());
+ Int32Array value_offsets(
+ array.length(), checked_cast<const DenseUnionArray&>(array).value_offsets(),
+ nullptr, 0, array.offset());
RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + options_.indent_size, sink_));
}
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 8a32a24..604e716 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -983,7 +983,7 @@ struct ObjectWriterVisitor {
std::is_same<ExtensionType, Type>::value ||
std::is_base_of<IntervalType, Type>::value ||
std::is_same<TimestampType, Type>::value ||
- std::is_same<UnionType, Type>::value,
+ std::is_base_of<UnionType, Type>::value,
Status>
Visit(const Type& type) {
return Status::NotImplemented("No implemented conversion to object dtype: ",
diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc
index 97080df..c7d99d2 100644
--- a/cpp/src/arrow/python/deserialize.cc
+++ b/cpp/src/arrow/python/deserialize.cc
@@ -249,7 +249,7 @@ Status DeserializeSequence(PyObject* context, const Array& array, int64_t start_
const SerializedPyObject& blobs,
CreateSequenceFn&& create_sequence, SetItemFn&& set_item,
PyObject** out) {
- const auto& data = checked_cast<const UnionArray&>(array);
+ const auto& data = checked_cast<const DenseUnionArray&>(array);
OwnedRef result(create_sequence(stop_idx - start_idx));
RETURN_IF_PYERROR();
const int8_t* type_codes = data.raw_type_codes();
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 89312bb..9d10041 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -301,8 +301,8 @@ class SequenceBuilder {
class DictBuilder {
public:
explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) {
- builder_.reset(new StructBuilder(struct_({field("keys", union_(UnionMode::DENSE)),
- field("vals", union_(UnionMode::DENSE))}),
+ builder_.reset(new StructBuilder(struct_({field("keys", dense_union(FieldVector{})),
+ field("vals", dense_union(FieldVector{}))}),
pool, {keys_.builder(), vals_.builder()}));
}
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index cc32d8a..030b2cc 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -470,7 +470,12 @@ struct FromTypeVisitor {
return Status::Invalid("attempting to cast scalar of type null to ", *to_type_);
}
- Status Visit(const UnionType&) { return Status::NotImplemented("cast to ", *to_type_); }
+ Status Visit(const SparseUnionType&) {
+ return Status::NotImplemented("cast to ", *to_type_);
+ }
+ Status Visit(const DenseUnionType&) {
+ return Status::NotImplemented("cast to ", *to_type_);
+ }
Status Visit(const DictionaryType&) {
return Status::NotImplemented("cast to ", *to_type_);
}
@@ -499,7 +504,10 @@ struct ToTypeVisitor {
return Status::OK();
}
- Status Visit(const UnionType&) {
+ Status Visit(const SparseUnionType&) {
+ return Status::NotImplemented("cast from ", *from_.type);
+ }
+ Status Visit(const DenseUnionType&) {
return Status::NotImplemented("cast from ", *from_.type);
}
Status Visit(const DictionaryType&) {
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 5caf04d..57751e7 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -373,7 +373,16 @@ struct ARROW_EXPORT StructScalar : public Scalar {
struct ARROW_EXPORT UnionScalar : public Scalar {
using Scalar::Scalar;
- using TypeClass = UnionType;
+};
+
+struct ARROW_EXPORT SparseUnionScalar : public UnionScalar {
+ using UnionScalar::UnionScalar;
+ using TypeClass = SparseUnionType;
+};
+
+struct ARROW_EXPORT DenseUnionScalar : public UnionScalar {
+ using UnionScalar::UnionScalar;
+ using TypeClass = DenseUnionType;
};
struct ARROW_EXPORT DictionaryScalar : public Scalar {
diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h
index 94cd867..6022f7e 100644
--- a/cpp/src/arrow/testing/util.h
+++ b/cpp/src/arrow/testing/util.h
@@ -169,6 +169,11 @@ Result<std::shared_ptr<Array>> ArrayFromBuilderVisitor(
return ArrayFromBuilderVisitor(type, length, length, std::forward<Fn>(fn));
}
+static inline std::vector<std::shared_ptr<DataType> (*)(FieldVector, std::vector<int8_t>)>
+UnionTypeFactories() {
+ return {sparse_union, dense_union};
+}
+
// Get a TCP port number to listen on. This is a different number every time,
// as reusing the same port across tests can produce spurious bind errors on
// Windows.
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 5aef34b..a6b383b 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -40,6 +40,7 @@
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/make_unique.h"
+#include "arrow/util/range.h"
#include "arrow/util/vector.h"
#include "arrow/visitor_inline.h"
@@ -67,7 +68,9 @@ constexpr Type::type StructType::type_id;
constexpr Type::type Decimal128Type::type_id;
-constexpr Type::type UnionType::type_id;
+constexpr Type::type SparseUnionType::type_id;
+
+constexpr Type::type DenseUnionType::type_id;
constexpr Type::type Date32Type::type_id;
@@ -141,8 +144,10 @@ std::string ToString(Type::type id) {
return "LIST";
case Type::STRUCT:
return "STRUCT";
- case Type::UNION:
- return "UNION";
+ case Type::SPARSE_UNION:
+ return "SPARSE_UNION";
+ case Type::DENSE_UNION:
+ return "DENSE_UNION";
case Type::DICTIONARY:
return "DICTIONARY";
case Type::MAP:
@@ -553,27 +558,23 @@ std::string DurationType::ToString() const {
constexpr int8_t UnionType::kMaxTypeCode;
constexpr int UnionType::kInvalidChildId;
-UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
- const std::vector<int8_t>& type_codes, UnionMode::type mode)
- : NestedType(Type::UNION),
- mode_(mode),
- type_codes_(type_codes),
+UnionMode::type UnionType::mode() const {
+ return id_ == Type::SPARSE_UNION ? UnionMode::SPARSE : UnionMode::DENSE;
+}
+
+UnionType::UnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes, Type::type id)
+ : NestedType(id),
+ type_codes_(std::move(type_codes)),
child_ids_(kMaxTypeCode + 1, kInvalidChildId) {
- DCHECK_OK(ValidateParameters(fields, type_codes, mode));
- children_ = fields;
+ children_ = std::move(fields);
+ DCHECK_OK(ValidateParameters(children_, type_codes_, mode()));
for (int child_id = 0; child_id < static_cast<int>(type_codes_.size()); ++child_id) {
const auto type_code = type_codes_[child_id];
child_ids_[type_code] = child_id;
}
}
-Result<std::shared_ptr<DataType>> UnionType::Make(
- const std::vector<std::shared_ptr<Field>>& fields,
- const std::vector<int8_t>& type_codes, UnionMode::type mode) {
- RETURN_NOT_OK(ValidateParameters(fields, type_codes, mode));
- return std::make_shared<UnionType>(fields, type_codes, mode);
-}
-
Status UnionType::ValidateParameters(const std::vector<std::shared_ptr<Field>>& fields,
const std::vector<int8_t>& type_codes,
UnionMode::type mode) {
@@ -589,10 +590,9 @@ Status UnionType::ValidateParameters(const std::vector<std::shared_ptr<Field>>&
}
DataTypeLayout UnionType::layout() const {
- if (mode_ == UnionMode::SPARSE) {
- return DataTypeLayout({DataTypeLayout::Bitmap(),
- DataTypeLayout::FixedWidth(sizeof(uint8_t)),
- DataTypeLayout::AlwaysNull()});
+ if (mode() == UnionMode::SPARSE) {
+ return DataTypeLayout(
+ {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(uint8_t))});
} else {
return DataTypeLayout({DataTypeLayout::Bitmap(),
DataTypeLayout::FixedWidth(sizeof(uint8_t)),
@@ -609,11 +609,7 @@ uint8_t UnionType::max_type_code() const {
std::string UnionType::ToString() const {
std::stringstream s;
- if (mode_ == UnionMode::SPARSE) {
- s << "union[sparse]<";
- } else {
- s << "union[dense]<";
- }
+ s << name() << "<";
for (size_t i = 0; i < children_.size(); ++i) {
if (i) {
@@ -625,6 +621,26 @@ std::string UnionType::ToString() const {
return s.str();
}
+SparseUnionType::SparseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes)
+ : UnionType(fields, type_codes, Type::SPARSE_UNION) {}
+
+Result<std::shared_ptr<DataType>> SparseUnionType::Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes) {
+ RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::SPARSE));
+ return std::make_shared<SparseUnionType>(fields, type_codes);
+}
+
+DenseUnionType::DenseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes)
+ : UnionType(fields, type_codes, Type::DENSE_UNION) {}
+
+Result<std::shared_ptr<DataType>> DenseUnionType::Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes) {
+ RETURN_NOT_OK(ValidateParameters(fields, type_codes, UnionMode::DENSE));
+ return std::make_shared<DenseUnionType>(fields, type_codes);
+}
+
// ----------------------------------------------------------------------
// Struct type
@@ -1867,7 +1883,7 @@ std::string StructType::ComputeFingerprint() const {
std::string UnionType::ComputeFingerprint() const {
std::stringstream ss;
ss << TypeIdFingerprint(*this);
- switch (mode_) {
+ switch (mode()) {
case UnionMode::SPARSE:
ss << "[s";
break;
@@ -2025,45 +2041,59 @@ std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fie
return std::make_shared<StructType>(fields);
}
-std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
- const std::vector<int8_t>& type_codes,
- UnionMode::type mode) {
- return std::make_shared<UnionType>(child_fields, type_codes, mode);
+std::shared_ptr<DataType> sparse_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+ }
+ return std::make_shared<SparseUnionType>(std::move(child_fields),
+ std::move(type_codes));
+}
+std::shared_ptr<DataType> dense_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
+ }
+ return std::make_shared<DenseUnionType>(std::move(child_fields), std::move(type_codes));
}
-std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
- UnionMode::type mode) {
- std::vector<int8_t> type_codes(child_fields.size());
- for (int i = 0; i < static_cast<int>(child_fields.size()); ++i) {
- type_codes[i] = static_cast<int8_t>(i);
+FieldVector FieldsFromArraysAndNames(std::vector<std::string> names,
+ const ArrayVector& arrays) {
+ FieldVector fields(arrays.size());
+ int i = 0;
+ if (names.empty()) {
+ for (const auto& array : arrays) {
+ fields[i] = field(std::to_string(i), array->type());
+ ++i;
+ }
+ } else {
+ DCHECK_EQ(names.size(), arrays.size());
+ for (const auto& array : arrays) {
+ fields[i] = field(std::move(names[i]), array->type());
+ ++i;
+ }
}
- return std::make_shared<UnionType>(child_fields, type_codes, mode);
+ return fields;
}
-std::shared_ptr<DataType> union_(UnionMode::type mode) {
- std::vector<std::shared_ptr<Field>> child_fields;
- return union_(child_fields, mode);
+std::shared_ptr<DataType> sparse_union(const ArrayVector& children,
+ std::vector<std::string> field_names,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(children.size()));
+ }
+ auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
+ return sparse_union(std::move(fields), std::move(type_codes));
}
-std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<std::string>& field_names,
- const std::vector<int8_t>& given_type_codes,
- UnionMode::type mode) {
- std::vector<std::shared_ptr<Field>> fields;
- std::vector<int8_t> type_codes(given_type_codes);
- int8_t counter = 0;
- for (const auto& child : children) {
- if (field_names.size() == 0) {
- fields.push_back(field(std::to_string(counter), child->type()));
- } else {
- fields.push_back(field(std::move(field_names[counter]), child->type()));
- }
- if (given_type_codes.size() == 0) {
- type_codes.push_back(counter);
- }
- counter++;
+std::shared_ptr<DataType> dense_union(const ArrayVector& children,
+ std::vector<std::string> field_names,
+ std::vector<int8_t> type_codes) {
+ if (type_codes.empty()) {
+ type_codes = internal::Iota(static_cast<int8_t>(children.size()));
}
- return union_(fields, std::move(type_codes), mode);
+ auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
+ return dense_union(std::move(fields), std::move(type_codes));
}
std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 8bc7532..df830b6 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -127,8 +127,11 @@ struct Type {
/// Struct of logical types
STRUCT,
- /// Unions of logical types
- UNION,
+ /// Sparse unions of logical types
+ SPARSE_UNION,
+
+ /// Dense unions of logical types
+ DENSE_UNION,
/// Dictionary-encoded type, also called "categorical" or "factor"
/// in other programming languages. Holds the dictionary value
@@ -1015,25 +1018,22 @@ class ARROW_EXPORT Decimal128Type : public DecimalType {
/// \brief Concrete type class for union data
class ARROW_EXPORT UnionType : public NestedType {
public:
- static constexpr Type::type type_id = Type::UNION;
static constexpr int8_t kMaxTypeCode = 127;
static constexpr int kInvalidChildId = -1;
- static constexpr const char* type_name() { return "union"; }
-
- UnionType(const std::vector<std::shared_ptr<Field>>& fields,
- const std::vector<int8_t>& type_codes,
- UnionMode::type mode = UnionMode::SPARSE);
-
- // A constructor variant that validates input parameters
static Result<std::shared_ptr<DataType>> Make(
const std::vector<std::shared_ptr<Field>>& fields,
- const std::vector<int8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE);
+ const std::vector<int8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(fields, type_codes);
+ } else {
+ return dense_union(fields, type_codes);
+ }
+ }
DataTypeLayout layout() const override;
std::string ToString() const override;
- std::string name() const override { return "union"; }
/// The array of logical type ids.
///
@@ -1046,21 +1046,55 @@ class ARROW_EXPORT UnionType : public NestedType {
uint8_t max_type_code() const;
- UnionMode::type mode() const { return mode_; }
+ UnionMode::type mode() const;
- private:
- std::string ComputeFingerprint() const override;
+ protected:
+ UnionType(std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes,
+ Type::type id);
static Status ValidateParameters(const std::vector<std::shared_ptr<Field>>& fields,
const std::vector<int8_t>& type_codes,
UnionMode::type mode);
- UnionMode::type mode_;
+ private:
+ std::string ComputeFingerprint() const override;
std::vector<int8_t> type_codes_;
std::vector<int> child_ids_;
};
+class ARROW_EXPORT SparseUnionType : public UnionType {
+ public:
+ static constexpr Type::type type_id = Type::SPARSE_UNION;
+
+ static constexpr const char* type_name() { return "sparse_union"; }
+
+ SparseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes);
+
+ // A constructor variant that validates input parameters
+ static Result<std::shared_ptr<DataType>> Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes);
+
+ std::string name() const override { return "sparse_union"; }
+};
+
+class ARROW_EXPORT DenseUnionType : public UnionType {
+ public:
+ static constexpr Type::type type_id = Type::DENSE_UNION;
+
+ static constexpr const char* type_name() { return "dense_union"; }
+
+ DenseUnionType(std::vector<std::shared_ptr<Field>> fields,
+ std::vector<int8_t> type_codes);
+
+ // A constructor variant that validates input parameters
+ static Result<std::shared_ptr<DataType>> Make(
+ std::vector<std::shared_ptr<Field>> fields, std::vector<int8_t> type_codes);
+
+ std::string name() const override { return "dense_union"; }
+};
+
// ----------------------------------------------------------------------
// Date and time types
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index db51447..4c432cc 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -148,13 +148,20 @@ class Decimal128Array;
class Decimal128Builder;
struct Decimal128Scalar;
-class UnionType;
-class UnionArray;
-struct UnionScalar;
struct UnionMode {
enum type { SPARSE, DENSE };
};
+class SparseUnionType;
+class SparseUnionArray;
+class SparseUnionBuilder;
+struct SparseUnionScalar;
+
+class DenseUnionType;
+class DenseUnionArray;
+class DenseUnionBuilder;
+struct DenseUnionScalar;
+
template <typename TypeClass>
class NumericArray;
@@ -363,40 +370,83 @@ std::shared_ptr<DataType> ARROW_EXPORT time64(TimeUnit::type unit);
std::shared_ptr<DataType> ARROW_EXPORT
struct_(const std::vector<std::shared_ptr<Field>>& fields);
-/// \brief Create a UnionType instance
+/// \brief Create a SparseUnionType instance
+std::shared_ptr<DataType> ARROW_EXPORT sparse_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes = {});
+/// \brief Create a DenseUnionType instance
+std::shared_ptr<DataType> ARROW_EXPORT dense_union(FieldVector child_fields,
+ std::vector<int8_t> type_codes = {});
+
+/// \brief Create a SparseUnionType instance
std::shared_ptr<DataType> ARROW_EXPORT
-union_(const std::vector<std::shared_ptr<Field>>& child_fields,
- const std::vector<int8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE);
+sparse_union(const ArrayVector& children, std::vector<std::string> field_names = {},
+ std::vector<int8_t> type_codes = {});
+/// \brief Create a DenseUnionType instance
+std::shared_ptr<DataType> ARROW_EXPORT
+dense_union(const ArrayVector& children, std::vector<std::string> field_names = {},
+ std::vector<int8_t> type_codes = {});
/// \brief Create a UnionType instance
-std::shared_ptr<DataType> ARROW_EXPORT
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
union_(const std::vector<std::shared_ptr<Field>>& child_fields,
- UnionMode::type mode = UnionMode::SPARSE);
+ const std::vector<int8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(child_fields, type_codes);
+ } else {
+ return dense_union(child_fields, type_codes);
+ }
+}
/// \brief Create a UnionType instance
-std::shared_ptr<DataType> ARROW_EXPORT union_(UnionMode::type mode = UnionMode::SPARSE);
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Field>>& child_fields,
+ UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(child_fields);
+ } else {
+ return dense_union(child_fields);
+ }
+}
/// \brief Create a UnionType instance
-std::shared_ptr<DataType> ARROW_EXPORT
+ARROW_DEPRECATED("Deprecated in 1.0.0")
+inline std::shared_ptr<DataType> ARROW_EXPORT
union_(const std::vector<std::shared_ptr<Array>>& children,
const std::vector<std::string>& field_names, const std::vector<int8_t>& type_codes,
- UnionMode::type mode = UnionMode::SPARSE);
+ UnionMode::type mode = UnionMode::SPARSE) {
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(children, field_names, type_codes);
+ } else {
+ return dense_union(children, field_names, type_codes);
+ }
+}
/// \brief Create a UnionType instance
+ARROW_DEPRECATED("Deprecated in 1.0.0")
inline std::shared_ptr<DataType> ARROW_EXPORT
union_(const std::vector<std::shared_ptr<Array>>& children,
const std::vector<std::string>& field_names,
UnionMode::type mode = UnionMode::SPARSE) {
- return union_(children, field_names, {}, mode);
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(children, field_names);
+ } else {
+ return dense_union(children, field_names);
+ }
}
/// \brief Create a UnionType instance
+ARROW_DEPRECATED("Deprecated in 1.0.0")
inline std::shared_ptr<DataType> ARROW_EXPORT
union_(const std::vector<std::shared_ptr<Array>>& children,
UnionMode::type mode = UnionMode::SPARSE) {
- return union_(children, {}, {}, mode);
+ if (mode == UnionMode::SPARSE) {
+ return sparse_union(children);
+ } else {
+ return dense_union(children);
+ }
}
-
/// \brief Create a DictionaryType instance
/// \param[in] index_type the type of the dictionary indices (must be
/// a signed integer)
diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc
index 159913e..7cfde34 100644
--- a/cpp/src/arrow/type_test.cc
+++ b/cpp/src/arrow/type_test.cc
@@ -615,8 +615,10 @@ TEST_F(TestSchema, TestDeeplyNestedMetadataComparison) {
auto item0 = field("item", int32(), true);
auto item1 = field("item", int32(), true, key_value_metadata({{"foo", "baz"}}));
- Schema schema0({field("f", list(list(union_({field("struct", struct_({item0}))}))))});
- Schema schema1({field("f", list(list(union_({field("struct", struct_({item1}))}))))});
+ Schema schema0(
+ {field("f", list(list(sparse_union({field("struct", struct_({item0}))}))))});
+ Schema schema1(
+ {field("f", list(list(sparse_union({field("struct", struct_({item1}))}))))});
ASSERT_EQ(schema0.fingerprint(), schema1.fingerprint());
ASSERT_NE(schema0.metadata_fingerprint(), schema1.metadata_fingerprint());
@@ -1368,8 +1370,7 @@ TEST(TestNestedType, Equals) {
auto f_type = field(inner_name, int32());
std::vector<std::shared_ptr<Field>> fields = {f_type};
std::vector<int8_t> codes = {42};
- auto u_type = std::make_shared<UnionType>(fields, codes, UnionMode::SPARSE);
- return field(union_name, u_type);
+ return field(union_name, sparse_union(fields, codes));
};
auto s0 = create_struct("f0", "s0");
@@ -1513,16 +1514,12 @@ TEST(TestUnionType, Basics) {
child_ids2[11] = 1;
child_ids2[12] = 2;
- auto ty1 = checked_pointer_cast<UnionType>(union_(fields, UnionMode::DENSE));
- auto ty2 =
- checked_pointer_cast<UnionType>(union_(fields, type_codes1, UnionMode::DENSE));
- auto ty3 =
- checked_pointer_cast<UnionType>(union_(fields, type_codes2, UnionMode::DENSE));
- auto ty4 = checked_pointer_cast<UnionType>(union_(fields, UnionMode::SPARSE));
- auto ty5 =
- checked_pointer_cast<UnionType>(union_(fields, type_codes1, UnionMode::SPARSE));
- auto ty6 =
- checked_pointer_cast<UnionType>(union_(fields, type_codes2, UnionMode::SPARSE));
+ auto ty1 = checked_pointer_cast<UnionType>(dense_union(fields));
+ auto ty2 = checked_pointer_cast<UnionType>(dense_union(fields, type_codes1));
+ auto ty3 = checked_pointer_cast<UnionType>(dense_union(fields, type_codes2));
+ auto ty4 = checked_pointer_cast<UnionType>(sparse_union(fields));
+ auto ty5 = checked_pointer_cast<UnionType>(sparse_union(fields, type_codes1));
+ auto ty6 = checked_pointer_cast<UnionType>(sparse_union(fields, type_codes2));
ASSERT_EQ(ty1->type_codes(), type_codes1);
ASSERT_EQ(ty2->type_codes(), type_codes1);
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index d59af0d..dbe5844 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -358,9 +358,18 @@ struct TypeTraits<StructType> {
};
template <>
-struct TypeTraits<UnionType> {
- using ArrayType = UnionArray;
- using ScalarType = UnionScalar;
+struct TypeTraits<SparseUnionType> {
+ using ArrayType = SparseUnionArray;
+ using BuilderType = SparseUnionBuilder;
+ using ScalarType = SparseUnionScalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DenseUnionType> {
+ using ArrayType = DenseUnionArray;
+ using BuilderType = DenseUnionBuilder;
+ using ScalarType = DenseUnionScalar;
constexpr static bool is_parameter_free = false;
};
@@ -784,7 +793,8 @@ static inline bool is_nested(Type::type type_id) {
case Type::FIXED_SIZE_LIST:
case Type::MAP:
case Type::STRUCT:
- case Type::UNION:
+ case Type::SPARSE_UNION:
+ case Type::DENSE_UNION:
return true;
default:
break;
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc
index 111b9d9..0a452d5 100644
--- a/cpp/src/arrow/visitor.cc
+++ b/cpp/src/arrow/visitor.cc
@@ -63,7 +63,8 @@ ARRAY_VISITOR_DEFAULT(LargeListArray)
ARRAY_VISITOR_DEFAULT(MapArray)
ARRAY_VISITOR_DEFAULT(FixedSizeListArray)
ARRAY_VISITOR_DEFAULT(StructArray)
-ARRAY_VISITOR_DEFAULT(UnionArray)
+ARRAY_VISITOR_DEFAULT(SparseUnionArray)
+ARRAY_VISITOR_DEFAULT(DenseUnionArray)
ARRAY_VISITOR_DEFAULT(DictionaryArray)
ARRAY_VISITOR_DEFAULT(Decimal128Array)
ARRAY_VISITOR_DEFAULT(ExtensionArray)
@@ -110,7 +111,8 @@ TYPE_VISITOR_DEFAULT(LargeListType)
TYPE_VISITOR_DEFAULT(MapType)
TYPE_VISITOR_DEFAULT(FixedSizeListType)
TYPE_VISITOR_DEFAULT(StructType)
-TYPE_VISITOR_DEFAULT(UnionType)
+TYPE_VISITOR_DEFAULT(SparseUnionType)
+TYPE_VISITOR_DEFAULT(DenseUnionType)
TYPE_VISITOR_DEFAULT(DictionaryType)
TYPE_VISITOR_DEFAULT(ExtensionType)
diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h
index 5e83d35..7ab136c 100644
--- a/cpp/src/arrow/visitor.h
+++ b/cpp/src/arrow/visitor.h
@@ -59,7 +59,8 @@ class ARROW_EXPORT ArrayVisitor {
virtual Status Visit(const MapArray& array);
virtual Status Visit(const FixedSizeListArray& array);
virtual Status Visit(const StructArray& array);
- virtual Status Visit(const UnionArray& array);
+ virtual Status Visit(const SparseUnionArray& array);
+ virtual Status Visit(const DenseUnionArray& array);
virtual Status Visit(const DictionaryArray& array);
virtual Status Visit(const ExtensionArray& array);
};
@@ -100,7 +101,8 @@ class ARROW_EXPORT TypeVisitor {
virtual Status Visit(const MapType& type);
virtual Status Visit(const FixedSizeListType& type);
virtual Status Visit(const StructType& type);
- virtual Status Visit(const UnionType& type);
+ virtual Status Visit(const SparseUnionType& type);
+ virtual Status Visit(const DenseUnionType& type);
virtual Status Visit(const DictionaryType& type);
virtual Status Visit(const ExtensionType& type);
};
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index 1600ad9..e1023f1 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -74,7 +74,8 @@ namespace arrow {
ACTION(Map); \
ACTION(FixedSizeList); \
ACTION(Struct); \
- ACTION(Union); \
+ ACTION(SparseUnion); \
+ ACTION(DenseUnion); \
ACTION(Dictionary); \
ACTION(Extension)
diff --git a/cpp/src/gandiva/llvm_types_test.cc b/cpp/src/gandiva/llvm_types_test.cc
index cbcad35..6669683 100644
--- a/cpp/src/gandiva/llvm_types_test.cc
+++ b/cpp/src/gandiva/llvm_types_test.cc
@@ -53,7 +53,8 @@ TEST_F(TestLLVMTypes, TestFound) {
}
TEST_F(TestLLVMTypes, TestNotFound) {
- EXPECT_EQ(types_->IRType(arrow::Type::type::UNION), nullptr);
+ EXPECT_EQ(types_->IRType(arrow::Type::SPARSE_UNION), nullptr);
+ EXPECT_EQ(types_->IRType(arrow::Type::DENSE_UNION), nullptr);
EXPECT_EQ(types_->DataVecType(arrow::null()), nullptr);
}
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 8c9b9ce..b024423 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1590,7 +1590,8 @@ cdef class UnionArray(Array):
"""
if self.type.mode != "dense":
raise ArrowTypeError("Can only get value offsets for dense arrays")
- buf = pyarrow_wrap_buffer((<CUnionArray*> self.ap).value_offsets())
+ cdef CDenseUnionArray* dense = <CDenseUnionArray*> self.ap
+ buf = pyarrow_wrap_buffer(dense.value_offsets())
return Array.from_buffers(int32(), len(self), [None, buf])
@staticmethod
@@ -1626,7 +1627,7 @@ cdef class UnionArray(Array):
for x in type_codes:
c_type_codes.push_back(x)
with nogil:
- out = GetResultValue(CUnionArray.MakeDense(
+ out = GetResultValue(CDenseUnionArray.Make(
deref(types.ap), deref(value_offsets.ap), c, c_field_names,
c_type_codes))
cdef Array result = pyarrow_wrap_array(out)
@@ -1665,7 +1666,7 @@ cdef class UnionArray(Array):
for x in type_codes:
c_type_codes.push_back(x)
with nogil:
- out = GetResultValue(CUnionArray.MakeSparse(
+ out = GetResultValue(CSparseUnionArray.Make(
deref(types.ap), c, c_field_names, c_type_codes))
cdef Array result = pyarrow_wrap_array(out)
result.validate()
@@ -2055,7 +2056,8 @@ cdef dict _array_classes = {
_Type_LARGE_LIST: LargeListArray,
_Type_MAP: MapArray,
_Type_FIXED_SIZE_LIST: FixedSizeListArray,
- _Type_UNION: UnionArray,
+ _Type_SPARSE_UNION: UnionArray,
+ _Type_DENSE_UNION: UnionArray,
_Type_BINARY: BinaryArray,
_Type_STRING: StringArray,
_Type_LARGE_BINARY: LargeBinaryArray,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 9ed9af3..af434d5 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -83,7 +83,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
_Type_FIXED_SIZE_LIST" arrow::Type::FIXED_SIZE_LIST"
_Type_STRUCT" arrow::Type::STRUCT"
- _Type_UNION" arrow::Type::UNION"
+ _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION"
+ _Type_DENSE_UNION" arrow::Type::DENSE_UNION"
_Type_DICTIONARY" arrow::Type::DICTIONARY"
_Type_MAP" arrow::Type::MAP"
@@ -368,12 +369,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
vector[int] GetAllFieldIndices(const c_string& name)
cdef cppclass CUnionType" arrow::UnionType"(CDataType):
- CUnionType(const vector[shared_ptr[CField]]& fields,
- const vector[int8_t]& type_codes, UnionMode mode)
UnionMode mode()
const vector[int8_t]& type_codes()
const vector[int]& child_ids()
+ cdef shared_ptr[CDataType] CMakeSparseUnionType" arrow::sparse_union"(
+ vector[shared_ptr[CField]] fields,
+ vector[int8_t] type_codes)
+
+ cdef shared_ptr[CDataType] CMakeDenseUnionType" arrow::dense_union"(
+ vector[shared_ptr[CField]] fields,
+ vector[int8_t] type_codes)
+
cdef cppclass CSchema" arrow::Schema":
CSchema(const vector[shared_ptr[CField]]& fields)
CSchema(const vector[shared_ptr[CField]]& fields,
@@ -548,28 +555,32 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CDataType] value_type()
cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
+ shared_ptr[CBuffer] type_codes()
+ int8_t* raw_type_codes()
+ int child_id(int64_t index)
+ shared_ptr[CArray] field(int pos)
+ const CArray* UnsafeField(int pos)
+ UnionMode mode()
+
+ cdef cppclass CSparseUnionArray" arrow::SparseUnionArray"(CUnionArray):
@staticmethod
- CResult[shared_ptr[CArray]] MakeSparse(
+ CResult[shared_ptr[CArray]] Make(
const CArray& type_codes,
const vector[shared_ptr[CArray]]& children,
const vector[c_string]& field_names,
const vector[int8_t]& type_codes)
+ cdef cppclass CDenseUnionArray" arrow::DenseUnionArray"(CUnionArray):
@staticmethod
- CResult[shared_ptr[CArray]] MakeDense(
+ CResult[shared_ptr[CArray]] Make(
const CArray& type_codes,
const CArray& value_offsets,
const vector[shared_ptr[CArray]]& children,
const vector[c_string]& field_names,
const vector[int8_t]& type_codes)
- shared_ptr[CBuffer] type_codes()
- int8_t* raw_type_codes()
int32_t value_offset(int i)
shared_ptr[CBuffer] value_offsets()
- int child_id(int64_t index)
- shared_ptr[CArray] field(int pos)
- UnionMode mode()
cdef cppclass CBinaryArray" arrow::BinaryArray"(CArray):
const uint8_t* GetValue(int i, int32_t* length)
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index e33f5d0..cd5a252 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -93,7 +93,8 @@ Type_LARGE_LIST = _Type_LARGE_LIST
Type_MAP = _Type_MAP
Type_FIXED_SIZE_LIST = _Type_FIXED_SIZE_LIST
Type_STRUCT = _Type_STRUCT
-Type_UNION = _Type_UNION
+Type_SPARSE_UNION = _Type_SPARSE_UNION
+Type_DENSE_UNION = _Type_DENSE_UNION
Type_DICTIONARY = _Type_DICTIONARY
UnionMode_SPARSE = _UnionMode_SPARSE
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 20898ff..fc400d7 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -93,7 +93,9 @@ cdef api object pyarrow_wrap_data_type(
out = FixedSizeListType.__new__(FixedSizeListType)
elif type.get().id() == _Type_STRUCT:
out = StructType.__new__(StructType)
- elif type.get().id() == _Type_UNION:
+ elif type.get().id() == _Type_SPARSE_UNION:
+ out = UnionType.__new__(UnionType)
+ elif type.get().id() == _Type_DENSE_UNION:
out = UnionType.__new__(UnionType)
elif type.get().id() == _Type_TIMESTAMP:
out = TimestampType.__new__(TimestampType)
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 128bc7c..606ae87 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -850,11 +850,13 @@ cdef class UnionValue(ArrayValue):
cdef getitem(self, int64_t i):
cdef int child_id = self.ap.child_id(i)
cdef shared_ptr[CArray] child = self.ap.field(child_id)
+ cdef CDenseUnionArray* dense
if self.ap.mode() == _UnionMode_SPARSE:
return box_scalar(self.type[child_id].type, child, i)
else:
+ dense = <CDenseUnionArray*> self.ap
return box_scalar(self.type[child_id].type, child,
- self.ap.value_offset(i))
+ dense.value_offset(i))
def as_py(self):
"""
@@ -984,7 +986,8 @@ cdef dict _array_value_classes = {
_Type_LARGE_LIST: LargeListValue,
_Type_MAP: MapValue,
_Type_FIXED_SIZE_LIST: FixedSizeListValue,
- _Type_UNION: UnionValue,
+ _Type_SPARSE_UNION: UnionValue,
+ _Type_DENSE_UNION: UnionValue,
_Type_BINARY: BinaryValue,
_Type_STRING: StringValue,
_Type_LARGE_BINARY: LargeBinaryValue,
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 8c59b90..0c3cc07 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -2429,11 +2429,9 @@ def union(children_fields, mode, type_codes=None):
c_type_codes = range(c_fields.size())
if mode == UnionMode_SPARSE:
- union_type.reset(new CUnionType(c_fields, c_type_codes,
- _UnionMode_SPARSE))
+ union_type = CMakeSparseUnionType(c_fields, c_type_codes)
else:
- union_type.reset(new CUnionType(c_fields, c_type_codes,
- _UnionMode_DENSE))
+ union_type = CMakeDenseUnionType(c_fields, c_type_codes)
return pyarrow_wrap_data_type(union_type)
diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py
index cc34cb6..6679154 100644
--- a/python/pyarrow/types.py
+++ b/python/pyarrow/types.py
@@ -35,8 +35,9 @@ _DATE_TYPES = {lib.Type_DATE32, lib.Type_DATE64}
_TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64}
_TEMPORAL_TYPES = {lib.Type_TIMESTAMP,
lib.Type_DURATION} | _TIME_TYPES | _DATE_TYPES
+_UNION_TYPES = {lib.Type_SPARSE_UNION, lib.Type_DENSE_UNION}
_NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT,
- lib.Type_UNION, lib.Type_MAP}
+ lib.Type_MAP} | _UNION_TYPES
def is_null(t):
@@ -190,7 +191,7 @@ def is_union(t):
"""
Return True if value is an instance of a union type.
"""
- return t.id == lib.Type_UNION
+ return t.id in _UNION_TYPES
def is_nested(t):
diff --git a/r/R/enums.R b/r/R/enums.R
index a818a35..223d5a7 100644
--- a/r/R/enums.R
+++ b/r/R/enums.R
@@ -68,9 +68,10 @@ Type <- enum("Type::type",
DECIMAL = 23L,
LIST = 24L,
STRUCT = 25L,
- UNION = 26L,
- DICTIONARY = 27L,
- MAP = 28L
+ SPARSE_UNION = 26L,
+ DENSE_UNION = 27L,
+ DICTIONARY = 28L,
+ MAP = 29L
)
#' @rdname enums
diff --git a/r/R/type.R b/r/R/type.R
index 4b0db1d..ab30524 100644
--- a/r/R/type.R
+++ b/r/R/type.R
@@ -70,7 +70,8 @@ DataType <- R6Class("DataType",
DECIMAL = shared_ptr(Decimal128Type, self$pointer()),
LIST = shared_ptr(ListType, self$pointer()),
STRUCT = shared_ptr(StructType, self$pointer()),
- UNION = stop("Type UNION not implemented yet"),
+ SPARSE_UNION = stop("Type SPARSE_UNION not implemented yet"),
+ DENSE_UNION = stop("Type DENSE_UNION not implemented yet"),
DICTIONARY = shared_ptr(DictionaryType, self$pointer()),
MAP = stop("Type MAP not implemented yet")
)
diff --git a/ruby/red-arrow/ext/arrow/converters.hpp b/ruby/red-arrow/ext/arrow/converters.hpp
index e357eda..3a19c8c 100644
--- a/ruby/red-arrow/ext/arrow/converters.hpp
+++ b/ruby/red-arrow/ext/arrow/converters.hpp
@@ -285,7 +285,8 @@ namespace red_arrow {
// VISIT(Interval)
VISIT(List)
VISIT(Struct)
- VISIT(Union)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
VISIT(Dictionary)
VISIT(Decimal128)
// TODO
@@ -388,7 +389,8 @@ namespace red_arrow {
// VISIT(Interval)
VISIT(List)
VISIT(Struct)
- VISIT(Union)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
VISIT(Dictionary)
VISIT(Decimal128)
// TODO
@@ -432,10 +434,10 @@ namespace red_arrow {
index_ = index;
switch (array.mode()) {
case arrow::UnionMode::SPARSE:
- convert_sparse(array);
+ convert_sparse(static_cast<const arrow::SparseUnionArray&>(array));
break;
case arrow::UnionMode::DENSE:
- convert_dense(array);
+ convert_dense(static_cast<const arrow::DenseUnionArray&>(array));
break;
default:
rb_raise(rb_eArgError, "Invalid union mode");
@@ -479,7 +481,8 @@ namespace red_arrow {
// VISIT(Interval)
VISIT(List)
VISIT(Struct)
- VISIT(Union)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
VISIT(Dictionary)
VISIT(Decimal128)
// TODO
@@ -516,7 +519,7 @@ namespace red_arrow {
return 0;
}
- void convert_sparse(const arrow::UnionArray& array) {
+ void convert_sparse(const arrow::SparseUnionArray& array) {
const auto type =
std::static_pointer_cast<arrow::UnionType>(array.type()).get();
const auto tag = "[raw-records][union-sparse-array]";
@@ -530,7 +533,7 @@ namespace red_arrow {
field_name_ = field_name_keep;
}
- void convert_dense(const arrow::UnionArray& array) {
+ void convert_dense(const arrow::DenseUnionArray& array) {
const auto type =
std::static_pointer_cast<arrow::UnionType>(array.type()).get();
const auto tag = "[raw-records][union-dense-array]";
diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp
index 2866263..28e857f 100644
--- a/ruby/red-arrow/ext/arrow/raw-records.cpp
+++ b/ruby/red-arrow/ext/arrow/raw-records.cpp
@@ -100,7 +100,8 @@ namespace red_arrow {
// VISIT(Interval)
VISIT(List)
VISIT(Struct)
- VISIT(Union)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
VISIT(Dictionary)
VISIT(Decimal128)
// TODO
diff --git a/ruby/red-arrow/ext/arrow/values.cpp b/ruby/red-arrow/ext/arrow/values.cpp
index 624d265..fd2f127 100644
--- a/ruby/red-arrow/ext/arrow/values.cpp
+++ b/ruby/red-arrow/ext/arrow/values.cpp
@@ -81,7 +81,8 @@ namespace red_arrow {
// VISIT(Interval)
VISIT(List)
VISIT(Struct)
- VISIT(Union)
+ VISIT(SparseUnion)
+ VISIT(DenseUnion)
VISIT(Dictionary)
VISIT(Decimal128)
// TODO
diff --git a/ruby/red-arrow/test/test-dense-union-data-type.rb b/ruby/red-arrow/test/test-dense-union-data-type.rb
index 96699e5..d8da6f7 100644
--- a/ruby/red-arrow/test/test-dense-union-data-type.rb
+++ b/ruby/red-arrow/test/test-dense-union-data-type.rb
@@ -28,12 +28,12 @@ class DenseUnionDataTypeTest < Test::Unit::TestCase
end
test("ordered arguments") do
- assert_equal("union[dense]<visible: bool=2, count: int32=9>",
+ assert_equal("dense_union<visible: bool=2, count: int32=9>",
Arrow::DenseUnionDataType.new(@fields, [2, 9]).to_s)
end
test("description") do
- assert_equal("union[dense]<visible: bool=2, count: int32=9>",
+ assert_equal("dense_union<visible: bool=2, count: int32=9>",
Arrow::DenseUnionDataType.new(fields: @fields,
type_codes: [2, 9]).to_s)
end
diff --git a/ruby/red-arrow/test/test-sparse-union-data-type.rb b/ruby/red-arrow/test/test-sparse-union-data-type.rb
index 4159b42..e672f82 100644
--- a/ruby/red-arrow/test/test-sparse-union-data-type.rb
+++ b/ruby/red-arrow/test/test-sparse-union-data-type.rb
@@ -28,12 +28,12 @@ class SparseUnionDataTypeTest < Test::Unit::TestCase
end
test("ordered arguments") do
- assert_equal("union[sparse]<visible: bool=2, count: int32=9>",
+ assert_equal("sparse_union<visible: bool=2, count: int32=9>",
Arrow::SparseUnionDataType.new(@fields, [2, 9]).to_s)
end
test("description") do
- assert_equal("union[sparse]<visible: bool=2, count: int32=9>",
+ assert_equal("sparse_union<visible: bool=2, count: int32=9>",
Arrow::SparseUnionDataType.new(fields: @fields,
type_codes: [2, 9]).to_s)
end