You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/08/01 22:05:29 UTC

[arrow] branch master updated: ARROW-6068: [C++] Allow passing Field instances to StructArray::Make

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d44f03f  ARROW-6068: [C++] Allow passing Field instances to StructArray::Make
d44f03f is described below

commit d44f03fadd1b8deb4cb17ef5584d9094c718579c
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Aug 1 17:05:11 2019 -0500

    ARROW-6068: [C++] Allow passing Field instances to StructArray::Make
    
    This helps fix a Python test failure when hypothesis testing is enabled.
    
    Closes #4981 from pitrou/ARROW-6068-struct-array-from-fields and squashes the following commits:
    
    12049109a <Antoine Pitrou> ARROW-6068:  Allow passing Field instances to StructArray::Make
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/array-struct-test.cc   | 59 +++++++++++++++++++++++++++++-------
 cpp/src/arrow/array.cc               | 22 ++++++++++----
 cpp/src/arrow/array.h                | 11 +++++++
 python/pyarrow/array.pxi             | 49 +++++++++++++++++++++---------
 python/pyarrow/includes/libarrow.pxd | 10 +++++-
 python/pyarrow/tests/strategies.py   |  8 ++---
 python/pyarrow/tests/test_array.py   | 30 +++++++++++++++---
 7 files changed, 147 insertions(+), 42 deletions(-)

diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc
index a5fce6e..9acd2f9 100644
--- a/cpp/src/arrow/array-struct-test.cc
+++ b/cpp/src/arrow/array-struct-test.cc
@@ -80,8 +80,9 @@ TEST(StructArray, FromFieldNames) {
   std::shared_ptr<Array> a, b, c, array, expected;
   a = ArrayFromJSON(int32(), "[4, null]");
   b = ArrayFromJSON(utf8(), R"([null, "foo"])");
+  std::vector<std::string> field_names = {"a", "b"};
 
-  auto res = StructArray::Make({a, b}, {"a", "b"});
+  auto res = StructArray::Make({a, b}, field_names);
   ASSERT_OK(res);
   array = *res;
   expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}),
@@ -89,30 +90,33 @@ TEST(StructArray, FromFieldNames) {
   AssertArraysEqual(*array, *expected);
 
   // With non-zero offsets
-  res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0,
-                          /*offset =*/1);
+  res =
+      StructArray::Make({a, b}, field_names, /*null_bitmap =*/nullptr, /*null_count =*/0,
+                        /*offset =*/1);
   ASSERT_OK(res);
   array = *res;
   expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}),
                            R"([{"a": null, "b": "foo"}])");
   AssertArraysEqual(*array, *expected);
 
-  res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0,
-                          /*offset =*/2);
+  res =
+      StructArray::Make({a, b}, field_names, /*null_bitmap =*/nullptr, /*null_count =*/0,
+                        /*offset =*/2);
   ASSERT_OK(res);
   array = *res;
   expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}), R"([])");
   AssertArraysEqual(*array, *expected);
 
   // Offset greater than length
-  res = StructArray::Make({a, b}, {"a", "b"}, /*null_bitmap =*/nullptr, /*null_count =*/0,
-                          /*offset =*/3);
+  res =
+      StructArray::Make({a, b}, field_names, /*null_bitmap =*/nullptr, /*null_count =*/0,
+                        /*offset =*/3);
   ASSERT_RAISES(IndexError, res);
 
   // With null bitmap
   std::shared_ptr<Buffer> null_bitmap;
   BitmapFromVector<bool>({false, true}, &null_bitmap);
-  res = StructArray::Make({a, b}, {"a", "b"}, null_bitmap);
+  res = StructArray::Make({a, b}, field_names, null_bitmap);
   ASSERT_OK(res);
   array = *res;
   expected = ArrayFromJSON(struct_({field("a", int32()), field("b", utf8())}),
@@ -120,16 +124,49 @@ TEST(StructArray, FromFieldNames) {
   AssertArraysEqual(*array, *expected);
 
   // Mismatching array lengths
+  field_names = {"a", "c"};
   c = ArrayFromJSON(int64(), "[1, 2, 3]");
-  res = StructArray::Make({a, c}, {"a", "c"});
+  res = StructArray::Make({a, c}, field_names);
   ASSERT_RAISES(Invalid, res);
 
   // Mismatching number of fields
-  res = StructArray::Make({a, b}, {"a", "b", "c"});
+  field_names = {"a", "b", "c"};
+  res = StructArray::Make({a, b}, field_names);
   ASSERT_RAISES(Invalid, res);
 
   // Fail on 0 children (cannot infer array length)
-  res = StructArray::Make({}, {});
+  field_names = {};
+  res = StructArray::Make({}, field_names);
+  ASSERT_RAISES(Invalid, res);
+}
+
+TEST(StructArray, FromFields) {
+  std::shared_ptr<Array> a, b, c, array, expected;
+  std::shared_ptr<Field> fa, fb, fc;
+  a = ArrayFromJSON(int32(), "[4, 5]");
+  b = ArrayFromJSON(utf8(), R"([null, "foo"])");
+  fa = field("a", int32(), /*nullable =*/false);
+  fb = field("b", utf8(), /*nullable =*/true);
+  fc = field("b", int64(), /*nullable =*/true);
+
+  auto res = StructArray::Make({a, b}, {fa, fb});
+  ASSERT_OK(res);
+  array = *res;
+  expected =
+      ArrayFromJSON(struct_({fa, fb}), R"([{"a": 4, "b": null}, {"a": 5, "b": "foo"}])");
+  AssertArraysEqual(*array, *expected);
+
+  // Mismatching array lengths
+  c = ArrayFromJSON(int64(), "[1, 2, 3]");
+  res = StructArray::Make({a, c}, {fa, fc});
+  ASSERT_RAISES(Invalid, res);
+
+  // Mismatching number of fields
+  res = StructArray::Make({a, b}, {fa, fb, fc});
+  ASSERT_RAISES(Invalid, res);
+
+  // Fail on 0 children (cannot infer array length)
+  res = StructArray::Make({}, std::vector<std::shared_ptr<Field>>{});
   ASSERT_RAISES(Invalid, res);
 }
 
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 01f0ddb..847ed11 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -560,10 +560,10 @@ StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
 
 Result<std::shared_ptr<Array>> StructArray::Make(
     const std::vector<std::shared_ptr<Array>>& children,
-    const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
-    int64_t null_count, int64_t offset) {
-  if (children.size() != field_names.size()) {
-    return Status::Invalid("Mismatching number of field names and child arrays");
+    const std::vector<std::shared_ptr<Field>>& fields,
+    std::shared_ptr<Buffer> null_bitmap, int64_t null_count, int64_t offset) {
+  if (children.size() != fields.size()) {
+    return Status::Invalid("Mismatching number of fields and child arrays");
   }
   int64_t length = 0;
   if (children.size() == 0) {
@@ -578,12 +578,22 @@ Result<std::shared_ptr<Array>> StructArray::Make(
   if (offset > length) {
     return Status::IndexError("Offset greater than length of child arrays");
   }
+  return std::make_shared<StructArray>(struct_(fields), length - offset, children,
+                                       null_bitmap, null_count, offset);
+}
+
+Result<std::shared_ptr<Array>> StructArray::Make(
+    const std::vector<std::shared_ptr<Array>>& children,
+    const std::vector<std::string>& field_names, std::shared_ptr<Buffer> null_bitmap,
+    int64_t null_count, int64_t offset) {
+  if (children.size() != field_names.size()) {
+    return Status::Invalid("Mismatching number of field names and child arrays");
+  }
   std::vector<std::shared_ptr<Field>> fields(children.size());
   for (size_t i = 0; i < children.size(); ++i) {
     fields[i] = ::arrow::field(field_names[i], children[i]->type());
   }
-  return std::make_shared<StructArray>(struct_(fields), length - offset, children,
-                                       null_bitmap, null_count, offset);
+  return Make(children, fields, std::move(null_bitmap), null_count, offset);
 }
 
 const StructType* StructArray::struct_type() const {
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 2313994..1b3e47f 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -918,6 +918,17 @@ class ARROW_EXPORT StructArray : public Array {
       std::shared_ptr<Buffer> null_bitmap = NULLPTR,
       int64_t null_count = kUnknownNullCount, int64_t offset = 0);
 
+  /// \brief Return a StructArray from child arrays and fields.
+  ///
+  /// The length is automatically inferred from the arguments.
+  /// There should be at least one child array.  This method does not
+  /// check that field types and child array types are consistent.
+  static Result<std::shared_ptr<Array>> Make(
+      const std::vector<std::shared_ptr<Array>>& children,
+      const std::vector<std::shared_ptr<Field>>& fields,
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
   const StructType* struct_type() const;
 
   // Return a shared pointer in case the requestor desires to share ownership
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 4341c41..d974418 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1380,16 +1380,20 @@ cdef class StructArray(Array):
         return [pyarrow_wrap_array(arr) for arr in arrays]
 
     @staticmethod
-    def from_arrays(arrays, names=None):
+    def from_arrays(arrays, names=None, fields=None):
         """
-        Construct StructArray from collection of arrays representing each field
-        in the struct
+        Construct StructArray from collection of arrays representing
+        each field in the struct.
+
+        Either field names or field instances must be passed.
 
         Parameters
         ----------
         arrays : sequence of Array
-        names : List[str]
-            Field names
+        names : List[str] (optional)
+            Field names for each struct child
+        fields : List[Field] (optional)
+            Field instances for each struct child
 
         Returns
         -------
@@ -1399,29 +1403,46 @@ cdef class StructArray(Array):
             shared_ptr[CArray] c_array
             vector[shared_ptr[CArray]] c_arrays
             vector[c_string] c_names
+            vector[shared_ptr[CField]] c_fields
             CResult[shared_ptr[CArray]] c_result
             ssize_t num_arrays
             ssize_t length
             ssize_t i
+            Field py_field
             DataType struct_type
 
-        if names is None:
-            raise ValueError('Names are currently required')
+        if names is None and fields is None:
+            raise ValueError('Must pass either names or fields')
+        if names is not None and fields is not None:
+            raise ValueError('Must pass either names or fields, not both')
 
         arrays = [asarray(x) for x in arrays]
         for arr in arrays:
             c_arrays.push_back(pyarrow_unwrap_array(arr))
-        for name in names:
-            c_names.push_back(tobytes(name))
+        if names is not None:
+            for name in names:
+                c_names.push_back(tobytes(name))
+        else:
+            for item in fields:
+                if isinstance(item, tuple):
+                    py_field = field(*item)
+                else:
+                    py_field = item
+                c_fields.push_back(py_field.sp_field)
 
-        if c_arrays.size() == 0 and c_names.size() == 0:
+        if (c_arrays.size() == 0 and c_names.size() == 0 and
+                c_fields.size() == 0):
             # The C++ side doesn't allow this
             return array([], struct([]))
 
-        # XXX Cannot pass "nullptr" for a shared_ptr<T> argument:
-        # https://github.com/cython/cython/issues/3020
-        c_result = CStructArray.Make(c_arrays, c_names,
-                                     shared_ptr[CBuffer](), -1, 0)
+        if names is not None:
+            # XXX Cannot pass "nullptr" for a shared_ptr<T> argument:
+            # https://github.com/cython/cython/issues/3020
+            c_result = CStructArray.MakeFromFieldNames(
+                c_arrays, c_names, shared_ptr[CBuffer](), -1, 0)
+        else:
+            c_result = CStructArray.MakeFromFields(
+                c_arrays, c_fields, shared_ptr[CBuffer](), -1, 0)
         return pyarrow_wrap_array(GetResultValue(c_result))
 
 
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index bfda15d..4dc6427 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -480,13 +480,21 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         # XXX Cython crashes if default argument values are declared here
         # https://github.com/cython/cython/issues/2167
         @staticmethod
-        CResult[shared_ptr[CArray]] Make(
+        CResult[shared_ptr[CArray]] MakeFromFieldNames "Make"(
             vector[shared_ptr[CArray]] children,
             vector[c_string] field_names,
             shared_ptr[CBuffer] null_bitmap,
             int64_t null_count,
             int64_t offset)
 
+        @staticmethod
+        CResult[shared_ptr[CArray]] MakeFromFields "Make"(
+            vector[shared_ptr[CArray]] children,
+            vector[shared_ptr[CField]] fields,
+            shared_ptr[CBuffer] null_bitmap,
+            int64_t null_count,
+            int64_t offset)
+
         shared_ptr[CArray] field(int pos)
         shared_ptr[CArray] GetFieldByName(const c_string& name) const
 
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index 7312ce3..498d738 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -167,13 +167,11 @@ def arrays(draw, type, size=None):
 
     if pa.types.is_struct(type):
         h.assume(len(type) > 0)
-        names, child_arrays = [], []
+        fields, child_arrays = [], []
         for field in type:
-            names.append(field.name)
+            fields.append(field)
             child_arrays.append(draw(arrays(field.type, size=size)))
-        # fields' metadata are lost here, because from_arrays doesn't accept
-        # a fields argumentum, only names
-        return pa.StructArray.from_arrays(child_arrays, names=names)
+        return pa.StructArray.from_arrays(child_arrays, fields=fields)
 
     if (pa.types.is_boolean(type) or pa.types.is_integer(type) or
             pa.types.is_floating(type)):
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index c14d291..c591544 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -388,17 +388,21 @@ def test_struct_from_buffers():
 
 
 def test_struct_from_arrays():
-    a = pa.array([4, 5, None])
+    a = pa.array([4, 5, 6])
     b = pa.array(["bar", None, ""])
     c = pa.array([[1, 2], None, [3, None]])
-
-    arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"])
-    assert arr.to_pylist() == [
+    expected_list = [
         {'a': 4, 'b': 'bar', 'c': [1, 2]},
         {'a': 5, 'b': None, 'c': None},
-        {'a': None, 'b': '', 'c': [3, None]},
+        {'a': 6, 'b': '', 'c': [3, None]},
     ]
 
+    # From field names
+    arr = pa.StructArray.from_arrays([a, b, c], ["a", "b", "c"])
+    assert arr.type == pa.struct(
+        [("a", a.type), ("b", b.type), ("c", c.type)])
+    assert arr.to_pylist() == expected_list
+
     with pytest.raises(ValueError):
         pa.StructArray.from_arrays([a, b, c], ["a", "b"])
 
@@ -406,6 +410,22 @@ def test_struct_from_arrays():
     assert arr.type == pa.struct([])
     assert arr.to_pylist() == []
 
+    # From fields
+    fa = pa.field("a", a.type, nullable=False)
+    fb = pa.field("b", b.type)
+    fc = pa.field("c", b.type)
+    arr = pa.StructArray.from_arrays([a, b, c], fields=[fa, fb, fc])
+    assert arr.type == pa.struct([fa, fb, fc])
+    assert not arr.type[0].nullable
+    assert arr.to_pylist() == expected_list
+
+    with pytest.raises(ValueError):
+        pa.StructArray.from_arrays([a, b, c], fields=[fa, fb])
+
+    arr = pa.StructArray.from_arrays([], fields=[])
+    assert arr.type == pa.struct([])
+    assert arr.to_pylist() == []
+
 
 def test_dictionary_from_numpy():
     indices = np.repeat([0, 1, 2], 2)