You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/08/01 22:05:29 UTC
[arrow] branch master updated: ARROW-6068: [C++] Allow passing
Field instances to StructArray::Make
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d44f03f ARROW-6068: [C++] Allow passing Field instances to StructArray::Make
d44f03f is described below
commit d44f03fadd1b8deb4cb17ef5584d9094c718579c
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Aug 1 17:05:11 2019 -0500
ARROW-6068: [C++] Allow passing Field instances to StructArray::Make
This helps fix a Python test failure when hypothesis testing is enabled.
Closes #4981 from pitrou/ARROW-6068-struct-array-from-fields and squashes the following commits:
12049109a <Antoine Pitrou> ARROW-6068: Allow passing Field instances to StructArray::Make
Authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/arrow/array-struct-test.cc | 59 +++++++++++++++++++++++++++++-------
cpp/src/arrow/array.cc | 22 ++++++++++----
cpp/src/arrow/array.h | 11 +++++++
python/pyarrow/array.pxi | 49 +++++++++++++++++++++---------
python/pyarrow/includes/libarrow.pxd | 10 +++++-
python/pyarrow/tests/strategies.py | 8 ++---
python/pyarrow/tests/test_array.py | 30 +++++++++++++++---
7 files changed, 147 insertions(+), 42 deletions(-)
diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc
index a5fce6e..9acd2f9 100644
--- a/cpp/src/arrow/array-struct-test.cc
+++ b/cpp/src/arrow/array-struct-test.cc
@@ -80,8 +80,9 @@ TEST(StructArray, FromFieldNames) {
std::shared_ptr<Array> a, b, c, array, expected;
a = ArrayFromJSON(int32(), "[4, null]");
b = ArrayFromJSON(utf8(), R"([null, "foo"])");
+ std::vector<std::string> field_names = {"a", "b"};
- auto res = StructArray::Make({a, b}, {"a", "b"});
+ auto res = StructArray::Make({a, b}, field_names);
ASSERT_OK(res);
array = *res;
expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}),
@@ -89,30 +90,33 @@ TEST(StructArray, FromFieldNames) {
AssertArraysEqual(*array, *expected);
// With non-zero offsets
- res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0,
- /*offset =*/1);
+ res =
+ StructArray::Make({a, b}, field_names, /*null_bitmap =*/nullptr, /*null_count =*/0,
+ /*offset =*/1);
ASSERT_OK(res);
array = *res;
expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}),
R"([{"a": null, "b": "foo"}])");
AssertArraysEqual(*array, *expected);
- res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0,
- /*offset =*/2);
+ res =
+ StructArray::Make({a, b}, field_names, /*null_bitmap =*/nullptr, /*null_count =*/0,
+ /*offset =*/2);
ASSERT_OK(res);
array = *res;
expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), R"([])");
AssertArraysEqual(*array, *expected);
// Offset greater than length
- res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0,
- /*offset =*/3);
+ res =
+ StructArray::Make({a, b}, field_names, /*null_bitmap =*/nullptr, /*null_count =*/0,
+ /*offset =*/3);
ASSERT_RAISES(IndexError, res);
// With null bitmap
std::shared_ptr<Buffer> null_bitmap;
BitmapFromVector<bool>({false, true}, &null_bitmap);
- res = StructArray::Make({a, b}, {"a", "b"}, null_bitmap);
+ res = StructArray::Make({a, b}, field_names, null_bitmap);
ASSERT_OK(res);
array = *res;
expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}),
@@ -120,16 +124,49 @@ TEST(StructArray, FromFieldNames) {
AssertArraysEqual(*array, *expected);
// Mismatching array lengths
+ field_names = {"a", "c"};
c = ArrayFromJSON(int64(), "[1, 2, 3]");
- res = StructArray::Make({a, c}, {"a", "c"});
+ res = StructArray::Make({a, c}, field_names);
ASSERT_RAISES(Invalid, res);
// Mismatching number of fields
- res = StructArray::Make({a, b}, {"a", "b", "c"});
+ field_names = {"a", "b", "c"};
+ res = StructArray::Make({a, b}, field_names);
ASSERT_RAISES(Invalid, res);
// Fail on 0 children (cannot infer array length)
- res = StructArray::Make({}, {});
+ field_names = {};
+ res = StructArray::Make({}, field_names);
+ ASSERT_RAISES(Invalid, res);
+}
+
+TEST(StructArray, FromFields) {
+ std::shared_ptr<Array> a, b, c, array, expected;
+ std::shared_ptr<Field> fa, fb, fc;
+ a = ArrayFromJSON(int32(), "[4, 5]");
+ b = ArrayFromJSON(utf8(), R"([null, "foo"])");
+ fa = field("a", int32(), /*nullable =*/false);
+ fb = field("b", utf8(), /*nullable =*/true);
+ fc = field("b", int64(), /*nullable =*/true);
+
+ auto res = StructArray::Make({a, b}, {fa, fb});
+ ASSERT_OK(res);
+ array = *res;
+ expected =
+ ArrayFromJSON(struct_({fa, fb}), R"([{"a": 4, "b": null}, {"a": 5, "b": "foo"}])");
+ AssertArraysEqual(*array, *expected);
+
+ // Mismatching array lengths
+ c = ArrayFromJSON(int64(), "[1, 2, 3]");
+ res = StructArray::Make({a, c}, {fa, fc});
+ ASSERT_RAISES(Invalid, res);
+
+ // Mismatching number of fields
+ res = StructArray::Make({a, b}, {fa, fb, fc});
+ ASSERT_RAISES(Invalid, res);
+
+ // Fail on 0 children (cannot infer array length)
+ res = StructArray::Make({}, std::vector<std::shared_ptr<Field>>{});
ASSERT_RAISES(Invalid, res);
}
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 01f0ddb..847ed11 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -560,10 +560,10 @@ StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
Result<std::shared_ptr<Array>> StructArray::Make(
const std::vector<std::shared_ptr<Array>>& children,
- const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
- int64_t null_count, int64_t offset) {
- if (children.size() != field_names.size()) {
- return Status::Invalid("Mismatching number of field names and child arrays");
+ const std::vector<std::shared_ptr<Field>>& fields,
+ std::shared_ptr<Buffer> null_bitmap, int64_t null_count, int64_t offset) {
+ if (children.size() != fields.size()) {
+ return Status::Invalid("Mismatching number of fields and child arrays");
}
int64_t length = 0;
if (children.size() == 0) {
@@ -578,12 +578,22 @@ Result<std::shared_ptr<Array>> StructArray::Make(
if (offset > length) {
return Status::IndexError("Offset greater than length of child arrays");
}
+ return std::make_shared<StructArray>(struct_(fields), length - offset, children,
+ null_bitmap, null_count, offset);
+}
+
+Result<std::shared_ptr<Array>> StructArray::Make(
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
+ int64_t null_count, int64_t offset) {
+ if (children.size() != field_names.size()) {
+ return Status::Invalid("Mismatching number of field names and child arrays");
+ }
std::vector<std::shared_ptr<Field>> fields(children.size());
for (size_t i = 0; i < children.size(); ++i) {
fields[i] = ::arrow::field(field_names[i], children[i]->type());
}
- return std::make_shared<StructArray>(struct_(fields), length - offset, children,
- null_bitmap, null_count, offset);
+ return Make(children, fields, std::move(null_bitmap), null_count, offset);
}
const StructType* StructArray::struct_type() const {
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 2313994..1b3e47f 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -918,6 +918,17 @@ class ARROW_EXPORT StructArray : public Array {
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+ /// \brief Return a StructArray from child arrays and fields.
+ ///
+ /// The length is automatically inferred from the arguments.
+ /// There should be at least one child array. This method does not
+ /// check that field types and child array types are consistent.
+ static Result<std::shared_ptr<Array>> Make(
+ const std::vector<std::shared_ptr<Array>>& children,
+ const std::vector<std::shared_ptr<Field>>& fields,
+ std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
const StructType* struct_type() const;
// Return a shared pointer in case the requestor desires to share ownership
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 4341c41..d974418 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1380,16 +1380,20 @@ cdef class StructArray(Array):
return [pyarrow_wrap_array(arr) for arr in arrays]
@staticmethod
- def from_arrays(arrays, names=None):
+ def from_arrays(arrays, names=None, fields=None):
"""
- Construct StructArray from collection of arrays representing each field
- in the struct
+ Construct StructArray from collection of arrays representing
+ each field in the struct.
+
+ Either field names or field instances must be passed.
Parameters
----------
arrays : sequence of Array
- names : List[str]
- Field names
+ names : List[str] (optional)
+ Field names for each struct child
+ fields : List[Field] (optional)
+ Field instances for each struct child
Returns
-------
@@ -1399,29 +1403,46 @@ cdef class StructArray(Array):
shared_ptr[CArray] c_array
vector[shared_ptr[CArray]] c_arrays
vector[c_string] c_names
+ vector[shared_ptr[CField]] c_fields
CResult[shared_ptr[CArray]] c_result
ssize_t num_arrays
ssize_t length
ssize_t i
+ Field py_field
DataType struct_type
- if names is None:
- raise ValueError('Names are currently required')
+ if names is None and fields is None:
+ raise ValueError('Must pass either names or fields')
+ if names is not None and fields is not None:
+ raise ValueError('Must pass either names or fields, not both')
arrays = [asarray(x) for x in arrays]
for arr in arrays:
c_arrays.push_back(pyarrow_unwrap_array(arr))
- for name in names:
- c_names.push_back(tobytes(name))
+ if names is not None:
+ for name in names:
+ c_names.push_back(tobytes(name))
+ else:
+ for item in fields:
+ if isinstance(item, tuple):
+ py_field = field(*item)
+ else:
+ py_field = item
+ c_fields.push_back(py_field.sp_field)
- if c_arrays.size() == 0 and c_names.size() == 0:
+ if (c_arrays.size() == 0 and c_names.size() == 0 and
+ c_fields.size() == 0):
# The C++ side doesn't allow this
return array([], struct([]))
- # XXX Cannot pass "nullptr" for a shared_ptr<T> argument:
- # https://github.com/cython/cython/issues/3020
- c_result = CStructArray.Make(c_arrays, c_names,
- shared_ptr[CBuffer](), -1, 0)
+ if names is not None:
+ # XXX Cannot pass "nullptr" for a shared_ptr<T> argument:
+ # https://github.com/cython/cython/issues/3020
+ c_result = CStructArray.MakeFromFieldNames(
+ c_arrays, c_names, shared_ptr[CBuffer](), -1, 0)
+ else:
+ c_result = CStructArray.MakeFromFields(
+ c_arrays, c_fields, shared_ptr[CBuffer](), -1, 0)
return pyarrow_wrap_array(GetResultValue(c_result))
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index bfda15d..4dc6427 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -480,13 +480,21 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
# XXX Cython crashes if default argument values are declared here
# https://github.com/cython/cython/issues/2167
@staticmethod
- CResult[shared_ptr[CArray]] Make(
+ CResult[shared_ptr[CArray]] MakeFromFieldNames "Make"(
vector[shared_ptr[CArray]] children,
vector[c_string] field_names,
shared_ptr[CBuffer] null_bitmap,
int64_t null_count,
int64_t offset)
+ @staticmethod
+ CResult[shared_ptr[CArray]] MakeFromFields "Make"(
+ vector[shared_ptr[CArray]] children,
+ vector[shared_ptr[CField]] fields,
+ shared_ptr[CBuffer] null_bitmap,
+ int64_t null_count,
+ int64_t offset)
+
shared_ptr[CArray] field(int pos)
shared_ptr[CArray] GetFieldByName(const c_string& name) const
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 7312ce3..498d738 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -167,13 +167,11 @@ def arrays(draw, type, size=None):
if pa.types.is_struct(type):
h.assume(len(type) > 0)
- names, child_arrays = [], []
+ fields, child_arrays = [], []
for field in type:
- names.append(field.name)
+ fields.append(field)
child_arrays.append(draw(arrays(field.type, size=size)))
- # fields' metadata are lost here, because from_arrays doesn't accept
- # a fields argumentum, only names
- return pa.StructArray.from_arrays(child_arrays, names=names)
+ return pa.StructArray.from_arrays(child_arrays, fields=fields)
if (pa.types.is_boolean(type) or pa.types.is_integer(type) or
pa.types.is_floating(type)):
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index c14d291..c591544 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -388,17 +388,21 @@ def test_struct_from_buffers():
def test_struct_from_arrays():
- a = pa.array([4, 5, None])
+ a = pa.array([4, 5, 6])
b = pa.array(["bar", None, ""])
c = pa.array([[1, 2], None, [3, None]])
-
- arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"])
- assert arr.to_pylist() == [
+ expected_list = [
{'a': 4, 'b': 'bar', 'c': [1, 2]},
{'a': 5, 'b': None, 'c': None},
- {'a': None, 'b': '', 'c': [3, None]},
+ {'a': 6, 'b': '', 'c': [3, None]},
]
+ # From field names
+ arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"])
+ assert arr.type == pa.struct(
+ [("a", a.type), ("b", b.type), ("c", c.type)])
+ assert arr.to_pylist() == expected_list
+
with pytest.raises(ValueError):
pa.StructArray.from_arrays([a, b, c], ["a", "b"])
@@ -406,6 +410,22 @@ def test_struct_from_arrays():
assert arr.type == pa.struct([])
assert arr.to_pylist() == []
+ # From fields
+ fa = pa.field("a", a.type, nullable=False)
+ fb = pa.field("b", b.type)
+ fc = pa.field("c", b.type)
+ arr = pa.StructArray.from_arrays([a, b, c], fields=[fa, fb, fc])
+ assert arr.type == pa.struct([fa, fb, fc])
+ assert not arr.type[0].nullable
+ assert arr.to_pylist() == expected_list
+
+ with pytest.raises(ValueError):
+ pa.StructArray.from_arrays([a, b, c], fields=[fa, fb])
+
+ arr = pa.StructArray.from_arrays([], fields=[])
+ assert arr.type == pa.struct([])
+ assert arr.to_pylist() == []
+
def test_dictionary_from_numpy():
indices = np.repeat([0, 1, 2], 2)