You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/13 19:45:22 UTC

[arrow] branch master updated: ARROW-5531: [Python] Implement Array.from_buffers for varbinary and nested types, add DataType.num_buffers property

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f068424  ARROW-5531: [Python] Implement Array.from_buffers for varbinary and nested types, add DataType.num_buffers property
f068424 is described below

commit f068424de7fea2a3d76eef6dc437745141a3d1d3
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu Jun 13 14:45:13 2019 -0500

    ARROW-5531: [Python] Implement Array.from_buffers for varbinary and nested types, add DataType.num_buffers property
    
    Thanks to Antoine's recent work on `Array::View` this method can be made more robust and safe by checking for the correct number of buffers.
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #4537 from wesm/ARROW-5531 and squashes the following commits:
    
    ec0695d86 <Wes McKinney> Address code review feedback
    a72533831 <Wes McKinney> Implement Array.from_buffers for nested types, add DataType.num_buffers, more checks
---
 python/pyarrow/array.pxi             |  53 ++++++++++-------
 python/pyarrow/includes/libarrow.pxd |   6 ++
 python/pyarrow/tests/test_array.py   | 107 ++++++++++++++++++++++++-----------
 python/pyarrow/types.pxi             |  29 +++++-----
 4 files changed, 128 insertions(+), 67 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ae6104f..cce967e 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -561,9 +561,10 @@ cdef class Array(_PandasConvertible):
             (_reduce_array_data(self.sp_array.get().data().get()),)
 
     @staticmethod
-    def from_buffers(DataType type, length, buffers, null_count=-1, offset=0):
+    def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
+                     children=None):
         """
-        Construct an Array from a sequence of buffers.  The concrete type
+        Construct an Array from a sequence of buffers. The concrete type
         returned depends on the datatype.
 
         Parameters
@@ -578,6 +579,8 @@ cdef class Array(_PandasConvertible):
         offset : int, default 0
             The array's logical offset (in values, not in bytes) from the
             start of each buffer
+        children : List[Array], default None
+            Nested type children with length matching type.num_children
 
         Returns
         -------
@@ -585,19 +588,36 @@ cdef class Array(_PandasConvertible):
         """
         cdef:
             Buffer buf
+            Array child
             vector[shared_ptr[CBuffer]] c_buffers
-            shared_ptr[CArrayData] ad
+            vector[shared_ptr[CArrayData]] c_child_data
+            shared_ptr[CArrayData] array_data
 
-        if not is_primitive(type.id):
-            raise NotImplementedError("from_buffers is only supported for "
-                                      "primitive arrays yet.")
+        children = children or []
+
+        if type.num_children != len(children):
+            raise ValueError("Type's expected number of children "
+                             "({0}) did not match the passed number "
+                             "({1}).".format(type.num_children, len(children)))
+
+        if type.num_buffers != len(buffers):
+            raise ValueError("Type's expected number of buffers "
+                             "({0}) did not match the passed number "
+                             "({1}).".format(type.num_buffers, len(buffers)))
 
         for buf in buffers:
             # None will produce a null buffer pointer
             c_buffers.push_back(pyarrow_unwrap_buffer(buf))
-        ad = CArrayData.Make(type.sp_type, length, c_buffers,
-                             null_count, offset)
-        return pyarrow_wrap_array(MakeArray(ad))
+
+        for child in children:
+            c_child_data.push_back(child.ap.data())
+
+        array_data = CArrayData.MakeWithChildren(type.sp_type, length,
+                                                 c_buffers, c_child_data,
+                                                 null_count, offset)
+        cdef Array result = pyarrow_wrap_array(MakeArray(array_data))
+        result.validate()
+        return result
 
     @property
     def null_count(self):
@@ -1214,18 +1234,9 @@ cdef class StringArray(Array):
         -------
         string_array : StringArray
         """
-        cdef shared_ptr[CBuffer] c_null_bitmap
-        cdef shared_ptr[CArray] out
-
-        if null_bitmap is not None:
-            c_null_bitmap = null_bitmap.buffer
-        else:
-            null_count = 0
-
-        out.reset(new CStringArray(
-            length, value_offsets.buffer, data.buffer, c_null_bitmap,
-            null_count, offset))
-        return pyarrow_wrap_array(out)
+        return Array.from_buffers(utf8(), length,
+                                  [null_bitmap, value_offsets, data],
+                                  null_count, offset)
 
 
 cdef class BinaryArray(Array):
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 1e32b87..f979cd6 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -83,6 +83,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         TimeUnit_MICRO" arrow::TimeUnit::MICRO"
         TimeUnit_NANO" arrow::TimeUnit::NANO"
 
+    cdef cppclass CDataTypeLayout" arrow::DataTypeLayout":
+        vector[int64_t] bit_widths
+        c_bool has_dictionary
+
     cdef cppclass CDataType" arrow::DataType":
         Type id()
 
@@ -94,6 +98,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
 
         int num_children()
 
+        CDataTypeLayout layout()
+
         c_string ToString()
 
     c_bool is_primitive(Type type)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 0b79017..f4fc23c 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -310,8 +310,81 @@ def test_array_from_buffers():
     with pytest.raises(TypeError):
         pa.Array.from_buffers(pa.int16(), 3, [u'', u''], offset=1)
 
-    with pytest.raises(NotImplementedError):
-        pa.Array.from_buffers(pa.list_(pa.int16()), 4, [None, values_buf])
+
+def test_string_binary_from_buffers():
+    array = pa.array(["a", None, "b", "c"])
+
+    buffers = array.buffers()
+    copied = pa.StringArray.from_buffers(
+        len(array), buffers[1], buffers[2], buffers[0], array.null_count,
+        array.offset)
+    assert copied.to_pylist() == ["a", None, "b", "c"]
+
+    binary_copy = pa.Array.from_buffers(pa.binary(), len(array),
+                                        array.buffers(), array.null_count,
+                                        array.offset)
+    assert binary_copy.to_pylist() == [b"a", None, b"b", b"c"]
+
+    copied = pa.StringArray.from_buffers(
+        len(array), buffers[1], buffers[2], buffers[0])
+    assert copied.to_pylist() == ["a", None, "b", "c"]
+
+    sliced = array[1:]
+    buffers = sliced.buffers()
+    copied = pa.StringArray.from_buffers(
+        len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
+    assert copied.to_pylist() == [None, "b", "c"]
+    assert copied.null_count == 1
+
+    # Slice but exclude all null entries so that we don't need to pass
+    # the null bitmap.
+    sliced = array[2:]
+    buffers = sliced.buffers()
+    copied = pa.StringArray.from_buffers(
+        len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
+    assert copied.to_pylist() == ["b", "c"]
+    assert copied.null_count == 0
+
+
+def test_list_from_buffers():
+    ty = pa.list_(pa.int16())
+    array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)
+
+    buffers = array.buffers()
+
+    with pytest.raises(ValueError):
+        # No children
+        pa.Array.from_buffers(ty, 4, [None, buffers[1]])
+
+    child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:])
+    copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child])
+    assert copied.equals(array)
+
+    with pytest.raises(ValueError):
+        # too many children
+        pa.Array.from_buffers(ty, 4, [None, buffers[1]],
+                              children=[child, child])
+
+
+def test_struct_from_buffers():
+    ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())])
+    array = pa.array([{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}],
+                     type=ty)
+    buffers = array.buffers()
+
+    with pytest.raises(ValueError):
+        # No children
+        pa.Array.from_buffers(ty, 3, [None, buffers[1]])
+
+    children = [pa.Array.from_buffers(pa.int16(), 3, buffers[1:3]),
+                pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])]
+    copied = pa.Array.from_buffers(ty, 3, buffers[:1], children=children)
+    assert copied.equals(array)
+
+    with pytest.raises(ValueError):
+        # not enough many children
+        pa.Array.from_buffers(ty, 3, [buffers[0]],
+                              children=children[:1])
 
 
 def test_dictionary_from_numpy():
@@ -499,36 +572,6 @@ def test_union_array_slice():
             assert arr[i:j].to_pylist() == lst[i:j]
 
 
-def test_string_from_buffers():
-    array = pa.array(["a", None, "b", "c"])
-
-    buffers = array.buffers()
-    copied = pa.StringArray.from_buffers(
-        len(array), buffers[1], buffers[2], buffers[0], array.null_count,
-        array.offset)
-    assert copied.to_pylist() == ["a", None, "b", "c"]
-
-    copied = pa.StringArray.from_buffers(
-        len(array), buffers[1], buffers[2], buffers[0])
-    assert copied.to_pylist() == ["a", None, "b", "c"]
-
-    sliced = array[1:]
-    buffers = sliced.buffers()
-    copied = pa.StringArray.from_buffers(
-        len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
-    assert copied.to_pylist() == [None, "b", "c"]
-    assert copied.null_count == 1
-
-    # Slice but exclude all null entries so that we don't need to pass
-    # the null bitmap.
-    sliced = array[2:]
-    buffers = sliced.buffers()
-    copied = pa.StringArray.from_buffers(
-        len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
-    assert copied.to_pylist() == ["b", "c"]
-    assert copied.null_count == 0
-
-
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
     if isinstance(out_data, pa.Array):
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 24feec7..9a92761 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -124,6 +124,21 @@ cdef class DataType:
             raise ValueError("Non-fixed width type")
         return ty.bit_width()
 
+    @property
+    def num_children(self):
+        """
+        The number of child fields.
+        """
+        return self.type.num_children()
+
+    @property
+    def num_buffers(self):
+        """
+        Number of data buffers required to construct Array type
+        excluding children
+        """
+        return self.type.layout().bit_widths.size()
+
     def __str__(self):
         return frombytes(self.type.ToString())
 
@@ -297,13 +312,6 @@ cdef class StructType(DataType):
     def __reduce__(self):
         return struct, (list(self),)
 
-    @property
-    def num_children(self):
-        """
-        The number of struct fields.
-        """
-        return self.type.num_children()
-
 
 cdef class UnionType(DataType):
     """
@@ -314,13 +322,6 @@ cdef class UnionType(DataType):
         DataType.init(self, type)
 
     @property
-    def num_children(self):
-        """
-        The number of union members.
-        """
-        return self.type.num_children()
-
-    @property
     def mode(self):
         """
         The mode of the union ("dense" or "sparse").