You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/04/09 18:33:01 UTC
[arrow] branch master updated: ARROW-4622: [C++][Python] MakeDense
and MakeSparse in UnionArray should accept a vector of Field
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 003d8d0 ARROW-4622: [C++][Python] MakeDense and MakeSparse in UnionArray should accept a vector of Field
003d8d0 is described below
commit 003d8d0d50b68d9629d567cae12a3aed4156125b
Author: Kenta Murata <mr...@mrkn.jp>
AuthorDate: Tue Apr 9 20:32:54 2019 +0200
ARROW-4622: [C++][Python] MakeDense and MakeSparse in UnionArray should accept a vector of Field
## TODO:
- [x] Write tests for existing behaviors
- [x] Support to supply field names
- [x] union_(field_names, children, mode)
- [x] Support to supply type codes
- [x] make format
- [x] Fix GLib binding
- [x] Fix Ruby binding
- [x] Fix Python binding
- [ ] Make CI green
Author: Kenta Murata <mr...@mrkn.jp>
Author: Antoine Pitrou <an...@python.org>
Closes #3723 from mrkn/make_union_array_with_field_names and squashes the following commits:
1480c3c72 <Antoine Pitrou> Some nits
90db62f97 <Kenta Murata> Fix coding style
c81b1c4fd <Kenta Murata> ninja format
8c598c9cf <Kenta Murata> Consolidate test cases
6c840454c <Kenta Murata> Fix variable names
b04e7cfdb <Kenta Murata> Fix style
40c1c6257 <Kenta Murata> Add support to create union array with field names and type codes
ec24b41d9 <Kenta Murata> Refactoring
66ae94210 <Kenta Murata> Add support to supply type codes
dc475ad1d <Kenta Murata> make format
18c574a51 <Kenta Murata> Add support to supply type codes to union_
d64882111 <Kenta Murata> Replace MakeUnionType with union_
09fd89ce9 <Kenta Murata> Add support to supply field names
ce7ee3752 <Kenta Murata> Add tests of MakeDense and MakeSparse of UnionArray
---
cpp/src/arrow/CMakeLists.txt | 6 +-
cpp/src/arrow/array-union-test.cc | 133 ++++++++++++++++++++++++++++++++---
cpp/src/arrow/array.cc | 28 +++++++-
cpp/src/arrow/array.h | 119 +++++++++++++++++++++++++++++++
cpp/src/arrow/type.cc | 14 +++-
cpp/src/arrow/type.h | 18 ++++-
python/pyarrow/array.pxi | 34 +++++++--
python/pyarrow/includes/libarrow.pxd | 4 ++
python/pyarrow/tests/test_array.py | 69 ++++++++++++++++--
9 files changed, 399 insertions(+), 26 deletions(-)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index d4fb19f..c045704 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -322,7 +322,8 @@ if(WIN32)
array-test.cc
array-binary-test.cc
array-list-test.cc
- array-struct-test.cc)
+ array-struct-test.cc
+ array-union-test.cc)
else()
add_arrow_test(array-test
SOURCES
@@ -330,7 +331,8 @@ else()
array-binary-test.cc
array-dict-test.cc
array-list-test.cc
- array-struct-test.cc)
+ array-struct-test.cc
+ array-union-test.cc)
endif()
add_arrow_test(buffer-test)
diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc
index 067d195..86cbeae 100644
--- a/cpp/src/arrow/array-union-test.cc
+++ b/cpp/src/arrow/array-union-test.cc
@@ -15,20 +15,15 @@
// specific language governing permissions and limitations
// under the License.
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <vector>
+#include <string>
#include <gtest/gtest.h>
#include "arrow/array.h"
-#include "arrow/builder.h"
-#include "arrow/status.h"
// TODO ipc shouldn't be included here
#include "arrow/ipc/test-common.h"
-#include "arrow/testing/gtest_common.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
@@ -36,7 +31,7 @@ namespace arrow {
using internal::checked_cast;
-TEST(TestUnionArrayAdHoc, TestSliceEquals) {
+TEST(TestUnionArray, TestSliceEquals) {
std::shared_ptr<RecordBatch> batch;
ASSERT_OK(ipc::test::MakeUnion(&batch));
@@ -71,4 +66,126 @@ TEST(TestUnionArrayAdHoc, TestSliceEquals) {
CheckUnion(batch->column(2));
}
+// -------------------------------------------------------------------------
+// Tests for MakeDense and MakeSparse
+
+class TestUnionArrayFactories : public ::testing::Test {
+ public:
+ void SetUp() {
+ pool_ = default_memory_pool();
+ ArrayFromVector<Int8Type>({0, 1, 2, 0, 1, 3, 2, 0, 2, 1}, &type_ids_);
+ }
+
+ void CheckUnionArray(const UnionArray& array, UnionMode::type mode,
+ const std::vector<std::string>& field_names,
+ const std::vector<uint8_t>& type_codes) {
+ ASSERT_EQ(mode, array.mode());
+ CheckFieldNames(array, field_names);
+ CheckTypeCodes(array, type_codes);
+ }
+
+ void CheckFieldNames(const UnionArray& array, const std::vector<std::string>& names) {
+ const auto& type = checked_cast<const UnionType&>(*array.type());
+ ASSERT_EQ(type.num_children(), names.size());
+ for (int i = 0; i < type.num_children(); ++i) {
+ ASSERT_EQ(type.child(i)->name(), names[i]);
+ }
+ }
+
+ void CheckTypeCodes(const UnionArray& array, const std::vector<uint8_t>& codes) {
+ const auto& type = checked_cast<const UnionType&>(*array.type());
+ ASSERT_EQ(codes, type.type_codes());
+ }
+
+ protected:
+ MemoryPool* pool_;
+ std::shared_ptr<Array> type_ids_;
+};
+
+TEST_F(TestUnionArrayFactories, TestMakeDense) {
+ std::shared_ptr<Array> value_offsets;
+ ArrayFromVector<Int32Type, int32_t>({0, 0, 0, 1, 1, 0, 1, 2, 1, 2}, &value_offsets);
+
+ auto children = std::vector<std::shared_ptr<Array>>(4);
+ ArrayFromVector<StringType, std::string>({"abc", "def", "xyz"}, &children[0]);
+ ArrayFromVector<UInt8Type>({10, 20, 30}, &children[1]);
+ ArrayFromVector<DoubleType>({1.618, 2.718, 3.142}, &children[2]);
+ ArrayFromVector<Int8Type>({-12}, &children[3]);
+
+ std::vector<std::string> field_names = {"str", "int1", "real", "int2"};
+ std::vector<uint8_t> type_codes = {1, 2, 4, 8};
+
+ std::shared_ptr<Array> result;
+
+ // without field names and type codes
+ ASSERT_OK(UnionArray::MakeDense(*type_ids_, *value_offsets, children, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::DENSE,
+ {"0", "1", "2", "3"}, {0, 1, 2, 3});
+
+ // with field names
+ ASSERT_RAISES(Invalid, UnionArray::MakeDense(*type_ids_, *value_offsets, children,
+ {"one"}, &result));
+ ASSERT_OK(
+ UnionArray::MakeDense(*type_ids_, *value_offsets, children, field_names, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::DENSE, field_names,
+ {0, 1, 2, 3});
+
+ // with type codes
+ ASSERT_RAISES(Invalid, UnionArray::MakeDense(*type_ids_, *value_offsets, children,
+ std::vector<uint8_t>{0}, &result));
+ ASSERT_OK(
+ UnionArray::MakeDense(*type_ids_, *value_offsets, children, type_codes, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::DENSE,
+ {"0", "1", "2", "3"}, type_codes);
+
+ // with field names and type codes
+ ASSERT_RAISES(Invalid, UnionArray::MakeDense(*type_ids_, *value_offsets, children,
+ {"one"}, type_codes, &result));
+ ASSERT_OK(UnionArray::MakeDense(*type_ids_, *value_offsets, children, field_names,
+ type_codes, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::DENSE, field_names,
+ type_codes);
+}
+
+TEST_F(TestUnionArrayFactories, TestMakeSparse) {
+ auto children = std::vector<std::shared_ptr<Array>>(4);
+ ArrayFromVector<StringType, std::string>(
+ {"abc", "", "", "def", "", "", "", "xyz", "", ""}, &children[0]);
+ ArrayFromVector<UInt8Type>({0, 10, 0, 0, 20, 0, 0, 0, 0, 30}, &children[1]);
+ ArrayFromVector<DoubleType>({0.0, 0.0, 1.618, 0.0, 0.0, 0.0, 2.718, 0.0, 3.142, 0.0},
+ &children[2]);
+ ArrayFromVector<Int8Type>({0, 0, 0, 0, 0, -12, 0, 0, 0, 0}, &children[3]);
+
+ std::vector<std::string> field_names = {"str", "int1", "real", "int2"};
+ std::vector<uint8_t> type_codes = {1, 2, 4, 8};
+
+ std::shared_ptr<Array> result;
+
+ // without field names and type codes
+ ASSERT_OK(UnionArray::MakeSparse(*type_ids_, children, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE,
+ {"0", "1", "2", "3"}, {0, 1, 2, 3});
+
+ // with field names
+ ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, {"one"}, &result));
+ ASSERT_OK(UnionArray::MakeSparse(*type_ids_, children, field_names, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE, field_names,
+ {0, 1, 2, 3});
+
+ // with type codes
+ ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children,
+ std::vector<uint8_t>{0}, &result));
+ ASSERT_OK(UnionArray::MakeSparse(*type_ids_, children, type_codes, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE,
+ {"0", "1", "2", "3"}, type_codes);
+
+ // with field names and type codes
+ ASSERT_RAISES(Invalid, UnionArray::MakeSparse(*type_ids_, children, {"one"}, type_codes,
+ &result));
+ ASSERT_OK(
+ UnionArray::MakeSparse(*type_ids_, children, field_names, type_codes, &result));
+ CheckUnionArray(checked_cast<UnionArray&>(*result), UnionMode::SPARSE, field_names,
+ type_codes);
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index bcf4342..5956dd2 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -504,6 +504,8 @@ UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int64_t length,
Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets,
const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ const std::vector<uint8_t>& type_codes,
std::shared_ptr<Array>* out) {
if (value_offsets.length() == 0) {
return Status::Invalid("UnionArray offsets must have non-zero length");
@@ -521,10 +523,20 @@ Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets,
return Status::Invalid("MakeDense does not allow NAs in value_offsets");
}
+ if (field_names.size() > 0 && field_names.size() != children.size()) {
+ return Status::Invalid("field_names must have the same length as children");
+ }
+
+ if (type_codes.size() > 0 && type_codes.size() != children.size()) {
+ return Status::Invalid("type_codes must have the same length as children");
+ }
+
BufferVector buffers = {type_ids.null_bitmap(),
checked_cast<const Int8Array&>(type_ids).values(),
checked_cast<const Int32Array&>(value_offsets).values()};
- auto union_type = union_(children, UnionMode::DENSE);
+
+ std::shared_ptr<DataType> union_type =
+ union_(children, field_names, type_codes, UnionMode::DENSE);
auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers),
type_ids.null_count(), type_ids.offset());
for (const auto& child : children) {
@@ -536,13 +548,25 @@ Status UnionArray::MakeDense(const Array& type_ids, const Array& value_offsets,
Status UnionArray::MakeSparse(const Array& type_ids,
const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ const std::vector<uint8_t>& type_codes,
std::shared_ptr<Array>* out) {
if (type_ids.type_id() != Type::INT8) {
return Status::Invalid("UnionArray type_ids must be signed int8");
}
+
+ if (field_names.size() > 0 && field_names.size() != children.size()) {
+ return Status::Invalid("field_names must have the same length as children");
+ }
+
+ if (type_codes.size() > 0 && type_codes.size() != children.size()) {
+ return Status::Invalid("type_codes must have the same length as children");
+ }
+
BufferVector buffers = {type_ids.null_bitmap(),
checked_cast<const Int8Array&>(type_ids).values(), nullptr};
- auto union_type = union_(children, UnionMode::SPARSE);
+ std::shared_ptr<DataType> union_type =
+ union_(children, field_names, type_codes, UnionMode::SPARSE);
auto internal_data = ArrayData::Make(union_type, type_ids.length(), std::move(buffers),
type_ids.null_count(), type_ids.offset());
for (const auto& child : children) {
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index a81f934..653993d 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -734,11 +734,75 @@ class ARROW_EXPORT UnionArray : public Array {
/// relative offset into the respective child array for the type in a given slot.
/// The respective offsets for each child value array must be in order / increasing.
/// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
+ /// \param[in] type_codes Vector of type codes.
/// \param[out] out Will have length equal to value_offsets.length()
static Status MakeDense(const Array& type_ids, const Array& value_offsets,
const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ const std::vector<uint8_t>& type_codes,
std::shared_ptr<Array>* out);
+ /// \brief Construct Dense UnionArray from type_ids, value_offsets and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types. The value_offsets are assumed to be well-formed.
+ ///
+ /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+ /// 0 corresponding to each type.
+ /// \param[in] value_offsets An array of signed int32 values indicating the
+ /// relative offset into the respective child array for the type in a given slot.
+ /// The respective offsets for each child value array must be in order / increasing.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
+ /// \param[out] out Will have length equal to value_offsets.length()
+ static Status MakeDense(const Array& type_ids, const Array& value_offsets,
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ std::shared_ptr<Array>* out) {
+ return MakeDense(type_ids, value_offsets, children, field_names, {}, out);
+ }
+
+ /// \brief Construct Dense UnionArray from type_ids, value_offsets and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types. The value_offsets are assumed to be well-formed.
+ ///
+ /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+ /// 0 corresponding to each type.
+ /// \param[in] value_offsets An array of signed int32 values indicating the
+ /// relative offset into the respective child array for the type in a given slot.
+ /// The respective offsets for each child value array must be in order / increasing.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] type_codes Vector of type codes.
+ /// \param[out] out Will have length equal to value_offsets.length()
+ static Status MakeDense(const Array& type_ids, const Array& value_offsets,
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<uint8_t>& type_codes,
+ std::shared_ptr<Array>* out) {
+ return MakeDense(type_ids, value_offsets, children, {}, type_codes, out);
+ }
+
+ /// \brief Construct Dense UnionArray from type_ids, value_offsets and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types. The value_offsets are assumed to be well-formed.
+ ///
+ /// The name of each field is filled by the index of the field.
+ ///
+ /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+ /// 0 corresponding to each type.
+ /// \param[in] value_offsets An array of signed int32 values indicating the
+ /// relative offset into the respective child array for the type in a given slot.
+ /// The respective offsets for each child value array must be in order / increasing.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[out] out Will have length equal to value_offsets.length()
+ static Status MakeDense(const Array& type_ids, const Array& value_offsets,
+ const std::vector<std::shared_ptr<Array>>& children,
+ std::shared_ptr<Array>* out) {
+ return MakeDense(type_ids, value_offsets, children, {}, {}, out);
+ }
+
/// \brief Construct Sparse UnionArray from type_ids and children
///
/// This function does the bare minimum of validation of the offsets and
@@ -747,11 +811,66 @@ class ARROW_EXPORT UnionArray : public Array {
/// \param[in] type_ids An array of 8-bit signed integers, enumerated from
/// 0 corresponding to each type.
/// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
+ /// \param[in] type_codes Vector of type codes.
/// \param[out] out Will have length equal to type_ids.length()
static Status MakeSparse(const Array& type_ids,
const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ const std::vector<uint8_t>& type_codes,
std::shared_ptr<Array>* out);
+ /// \brief Construct Sparse UnionArray from type_ids and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types.
+ ///
+ /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+ /// 0 corresponding to each type.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] field_names Vector of strings containing the name of each field.
+ /// \param[out] out Will have length equal to type_ids.length()
+ static Status MakeSparse(const Array& type_ids,
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ std::shared_ptr<Array>* out) {
+ return MakeSparse(type_ids, children, field_names, {}, out);
+ }
+
+ /// \brief Construct Sparse UnionArray from type_ids and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types.
+ ///
+ /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+ /// 0 corresponding to each type.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[in] type_codes Vector of type codes.
+ /// \param[out] out Will have length equal to type_ids.length()
+ static Status MakeSparse(const Array& type_ids,
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<uint8_t>& type_codes,
+ std::shared_ptr<Array>* out) {
+ return MakeSparse(type_ids, children, {}, type_codes, out);
+ }
+
+ /// \brief Construct Sparse UnionArray from type_ids and children
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types.
+ ///
+ /// The name of each field is filled by the index of the field.
+ ///
+ /// \param[in] type_ids An array of 8-bit signed integers, enumerated from
+ /// 0 corresponding to each type.
+ /// \param[in] children Vector of children Arrays containing the data for each type.
+ /// \param[out] out Will have length equal to type_ids.length()
+ static Status MakeSparse(const Array& type_ids,
+ const std::vector<std::shared_ptr<Array>>& children,
+ std::shared_ptr<Array>* out) {
+ return MakeSparse(type_ids, children, {}, {}, out);
+ }
+
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> type_ids() const { return data_->buffers[1]; }
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index fd37726..0e0d9fc 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -576,13 +576,21 @@ std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& chil
}
std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ const std::vector<uint8_t>& given_type_codes,
UnionMode::type mode) {
std::vector<std::shared_ptr<Field>> types;
- std::vector<uint8_t> type_codes;
+ std::vector<uint8_t> type_codes(given_type_codes);
uint8_t counter = 0;
for (const auto& child : children) {
- types.push_back(field(std::to_string(counter), child->type()));
- type_codes.push_back(counter);
+ if (field_names.size() == 0) {
+ types.push_back(field(std::to_string(counter), child->type()));
+ } else {
+ types.push_back(field(field_names[counter], child->type()));
+ }
+ if (given_type_codes.size() == 0) {
+ type_codes.push_back(counter);
+ }
counter++;
}
return union_(types, type_codes, mode);
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 525d174..4c35378 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -920,7 +920,23 @@ union_(const std::vector<std::shared_ptr<Field>>& child_fields,
/// \brief Create a UnionType instance
std::shared_ptr<DataType> ARROW_EXPORT
union_(const std::vector<std::shared_ptr<Array>>& children,
- UnionMode::type mode = UnionMode::SPARSE);
+ const std::vector<std::string>& field_names,
+ const std::vector<uint8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE);
+
+/// \brief Create a UnionType instance
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names,
+ UnionMode::type mode = UnionMode::SPARSE) {
+ return union_(children, field_names, {}, mode);
+}
+
+/// \brief Create a UnionType instance
+inline std::shared_ptr<DataType> ARROW_EXPORT
+union_(const std::vector<std::shared_ptr<Array>>& children,
+ UnionMode::type mode = UnionMode::SPARSE) {
+ return union_(children, {}, {}, mode);
+}
/// \brief Create a DictionaryType instance
std::shared_ptr<DataType> ARROW_EXPORT
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 1966245..9c57634 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1054,7 +1054,8 @@ cdef class UnionArray(Array):
"""
@staticmethod
- def from_dense(Array types, Array value_offsets, list children):
+ def from_dense(Array types, Array value_offsets, list children,
+ list field_names=None, list type_codes=None):
"""
Construct dense UnionArray from arrays of int8 types, int32 offsets and
children arrays
@@ -1064,6 +1065,8 @@ cdef class UnionArray(Array):
types : Array (int8 type)
value_offsets : Array (int32 type)
children : list
+ field_names : list
+ type_codes : list
Returns
-------
@@ -1072,15 +1075,25 @@ cdef class UnionArray(Array):
cdef shared_ptr[CArray] out
cdef vector[shared_ptr[CArray]] c
cdef Array child
+ cdef vector[c_string] c_field_names
+ cdef vector[uint8_t] c_type_codes
for child in children:
c.push_back(child.sp_array)
+ if field_names is not None:
+ for x in field_names:
+ c_field_names.push_back(tobytes(x))
+ if type_codes is not None:
+ for x in type_codes:
+ c_type_codes.push_back(x)
with nogil:
check_status(CUnionArray.MakeDense(
- deref(types.ap), deref(value_offsets.ap), c, &out))
+ deref(types.ap), deref(value_offsets.ap), c, c_field_names,
+ c_type_codes, &out))
return pyarrow_wrap_array(out)
@staticmethod
- def from_sparse(Array types, list children):
+ def from_sparse(Array types, list children, list field_names=None,
+ list type_codes=None):
"""
Construct sparse UnionArray from arrays of int8 types and children
arrays
@@ -1089,6 +1102,8 @@ cdef class UnionArray(Array):
----------
types : Array (int8 type)
children : list
+ field_names : list
+ type_codes : list
Returns
-------
@@ -1097,10 +1112,21 @@ cdef class UnionArray(Array):
cdef shared_ptr[CArray] out
cdef vector[shared_ptr[CArray]] c
cdef Array child
+ cdef vector[c_string] c_field_names
+ cdef vector[uint8_t] c_type_codes
for child in children:
c.push_back(child.sp_array)
+ if field_names is not None:
+ for x in field_names:
+ c_field_names.push_back(tobytes(x))
+ if type_codes is not None:
+ for x in type_codes:
+ c_type_codes.push_back(x)
with nogil:
- check_status(CUnionArray.MakeSparse(deref(types.ap), c, &out))
+ check_status(CUnionArray.MakeSparse(deref(types.ap), c,
+ c_field_names,
+ c_type_codes,
+ &out))
return pyarrow_wrap_array(out)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index e27f033..1649ee6 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -407,11 +407,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
@staticmethod
CStatus MakeSparse(const CArray& type_ids,
const vector[shared_ptr[CArray]]& children,
+ const vector[c_string]& field_names,
+ const vector[uint8_t]& type_codes,
shared_ptr[CArray]* out)
@staticmethod
CStatus MakeDense(const CArray& type_ids, const CArray& value_offsets,
const vector[shared_ptr[CArray]]& children,
+ const vector[c_string]& field_names,
+ const vector[uint8_t]& type_codes,
shared_ptr[CArray]* out)
uint8_t* raw_type_ids()
int32_t value_offset(int i)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 8c0143c..ffbf7e3 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -407,9 +407,39 @@ def test_union_from_dense():
types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32')
- result = pa.UnionArray.from_dense(types, value_offsets, [binary, int64])
-
- assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd']
+ def check_result(result, expected_field_names, expected_type_codes):
+ assert result.to_pylist() == [b'a', 1, b'c', b'b', 2, 3, b'd']
+ actual_field_names = [result.type[i].name
+ for i in range(result.type.num_children)]
+ assert actual_field_names == expected_field_names
+ assert result.type.type_codes == expected_type_codes
+
+ # without field names and type codes
+ check_result(pa.UnionArray.from_dense(types, value_offsets,
+ [binary, int64]),
+ expected_field_names=['0', '1'],
+ expected_type_codes=[0, 1])
+
+ # with field names
+ check_result(pa.UnionArray.from_dense(types, value_offsets,
+ [binary, int64],
+ ['bin', 'int']),
+ expected_field_names=['bin', 'int'],
+ expected_type_codes=[0, 1])
+
+ # with type codes
+ check_result(pa.UnionArray.from_dense(types, value_offsets,
+ [binary, int64],
+ type_codes=[11, 13]),
+ expected_field_names=['0', '1'],
+ expected_type_codes=[11, 13])
+
+ # with field names and type codes
+ check_result(pa.UnionArray.from_dense(types, value_offsets,
+ [binary, int64],
+ ['bin', 'int'], [11, 13]),
+ expected_field_names=['bin', 'int'],
+ expected_type_codes=[11, 13])
def test_union_from_sparse():
@@ -418,9 +448,36 @@ def test_union_from_sparse():
int64 = pa.array([0, 1, 0, 0, 2, 3, 0], type='int64')
types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
- result = pa.UnionArray.from_sparse(types, [binary, int64])
-
- assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd']
+ def check_result(result, expected_field_names, expected_type_codes):
+ assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd']
+ actual_field_names = [result.type[i].name
+ for i in range(result.type.num_children)]
+ assert actual_field_names == expected_field_names
+ assert result.type.type_codes == expected_type_codes
+
+ # without field names and type codes
+ check_result(pa.UnionArray.from_sparse(types, [binary, int64]),
+ expected_field_names=['0', '1'],
+ expected_type_codes=[0, 1])
+
+ # with field names
+ check_result(pa.UnionArray.from_sparse(types, [binary, int64],
+ ['bin', 'int']),
+ expected_field_names=['bin', 'int'],
+ expected_type_codes=[0, 1])
+
+ # with type codes
+ check_result(pa.UnionArray.from_sparse(types, [binary, int64],
+ type_codes=[11, 13]),
+ expected_field_names=['0', '1'],
+ expected_type_codes=[11, 13])
+
+ # with field names and type codes
+ check_result(pa.UnionArray.from_sparse(types, [binary, int64],
+ ['bin', 'int'],
+ [11, 13]),
+ expected_field_names=['bin', 'int'],
+ expected_type_codes=[11, 13])
def test_union_array_slice():