You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/13 19:45:22 UTC
[arrow] branch master updated: ARROW-5531: [Python] Implement
Array.from_buffers for varbinary and nested types,
add DataType.num_buffers property
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f068424 ARROW-5531: [Python] Implement Array.from_buffers for varbinary and nested types, add DataType.num_buffers property
f068424 is described below
commit f068424de7fea2a3d76eef6dc437745141a3d1d3
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu Jun 13 14:45:13 2019 -0500
ARROW-5531: [Python] Implement Array.from_buffers for varbinary and nested types, add DataType.num_buffers property
Thanks to Antoine's recent work on `Array::View` this method can be made more robust and safe by checking for the correct number of buffers.
Author: Wes McKinney <we...@apache.org>
Closes #4537 from wesm/ARROW-5531 and squashes the following commits:
ec0695d86 <Wes McKinney> Address code review feedback
a72533831 <Wes McKinney> Implement Array.from_buffers for nested types, add DataType.num_buffers, more checks
---
python/pyarrow/array.pxi | 53 ++++++++++-------
python/pyarrow/includes/libarrow.pxd | 6 ++
python/pyarrow/tests/test_array.py | 107 ++++++++++++++++++++++++-----------
python/pyarrow/types.pxi | 29 +++++-----
4 files changed, 128 insertions(+), 67 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ae6104f..cce967e 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -561,9 +561,10 @@ cdef class Array(_PandasConvertible):
(_reduce_array_data(self.sp_array.get().data().get()),)
@staticmethod
- def from_buffers(DataType type, length, buffers, null_count=-1, offset=0):
+ def from_buffers(DataType type, length, buffers, null_count=-1, offset=0,
+ children=None):
"""
- Construct an Array from a sequence of buffers. The concrete type
+ Construct an Array from a sequence of buffers. The concrete type
returned depends on the datatype.
Parameters
@@ -578,6 +579,8 @@ cdef class Array(_PandasConvertible):
offset : int, default 0
The array's logical offset (in values, not in bytes) from the
start of each buffer
+ children : List[Array], default None
+ Nested type children with length matching type.num_children
Returns
-------
@@ -585,19 +588,36 @@ cdef class Array(_PandasConvertible):
"""
cdef:
Buffer buf
+ Array child
vector[shared_ptr[CBuffer]] c_buffers
- shared_ptr[CArrayData] ad
+ vector[shared_ptr[CArrayData]] c_child_data
+ shared_ptr[CArrayData] array_data
- if not is_primitive(type.id):
- raise NotImplementedError("from_buffers is only supported for "
- "primitive arrays yet.")
+ children = children or []
+
+ if type.num_children != len(children):
+ raise ValueError("Type's expected number of children "
+ "({0}) did not match the passed number "
+ "({1}).".format(type.num_children, len(children)))
+
+ if type.num_buffers != len(buffers):
+ raise ValueError("Type's expected number of buffers "
+ "({0}) did not match the passed number "
+ "({1}).".format(type.num_buffers, len(buffers)))
for buf in buffers:
# None will produce a null buffer pointer
c_buffers.push_back(pyarrow_unwrap_buffer(buf))
- ad = CArrayData.Make(type.sp_type, length, c_buffers,
- null_count, offset)
- return pyarrow_wrap_array(MakeArray(ad))
+
+ for child in children:
+ c_child_data.push_back(child.ap.data())
+
+ array_data = CArrayData.MakeWithChildren(type.sp_type, length,
+ c_buffers, c_child_data,
+ null_count, offset)
+ cdef Array result = pyarrow_wrap_array(MakeArray(array_data))
+ result.validate()
+ return result
@property
def null_count(self):
@@ -1214,18 +1234,9 @@ cdef class StringArray(Array):
-------
string_array : StringArray
"""
- cdef shared_ptr[CBuffer] c_null_bitmap
- cdef shared_ptr[CArray] out
-
- if null_bitmap is not None:
- c_null_bitmap = null_bitmap.buffer
- else:
- null_count = 0
-
- out.reset(new CStringArray(
- length, value_offsets.buffer, data.buffer, c_null_bitmap,
- null_count, offset))
- return pyarrow_wrap_array(out)
+ return Array.from_buffers(utf8(), length,
+ [null_bitmap, value_offsets, data],
+ null_count, offset)
cdef class BinaryArray(Array):
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 1e32b87..f979cd6 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -83,6 +83,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
TimeUnit_MICRO" arrow::TimeUnit::MICRO"
TimeUnit_NANO" arrow::TimeUnit::NANO"
+ cdef cppclass CDataTypeLayout" arrow::DataTypeLayout":
+ vector[int64_t] bit_widths
+ c_bool has_dictionary
+
cdef cppclass CDataType" arrow::DataType":
Type id()
@@ -94,6 +98,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int num_children()
+ CDataTypeLayout layout()
+
c_string ToString()
c_bool is_primitive(Type type)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 0b79017..f4fc23c 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -310,8 +310,81 @@ def test_array_from_buffers():
with pytest.raises(TypeError):
pa.Array.from_buffers(pa.int16(), 3, [u'', u''], offset=1)
- with pytest.raises(NotImplementedError):
- pa.Array.from_buffers(pa.list_(pa.int16()), 4, [None, values_buf])
+
+def test_string_binary_from_buffers():
+ array = pa.array(["a", None, "b", "c"])
+
+ buffers = array.buffers()
+ copied = pa.StringArray.from_buffers(
+ len(array), buffers[1], buffers[2], buffers[0], array.null_count,
+ array.offset)
+ assert copied.to_pylist() == ["a", None, "b", "c"]
+
+ binary_copy = pa.Array.from_buffers(pa.binary(), len(array),
+ array.buffers(), array.null_count,
+ array.offset)
+ assert binary_copy.to_pylist() == [b"a", None, b"b", b"c"]
+
+ copied = pa.StringArray.from_buffers(
+ len(array), buffers[1], buffers[2], buffers[0])
+ assert copied.to_pylist() == ["a", None, "b", "c"]
+
+ sliced = array[1:]
+ buffers = sliced.buffers()
+ copied = pa.StringArray.from_buffers(
+ len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
+ assert copied.to_pylist() == [None, "b", "c"]
+ assert copied.null_count == 1
+
+ # Slice but exclude all null entries so that we don't need to pass
+ # the null bitmap.
+ sliced = array[2:]
+ buffers = sliced.buffers()
+ copied = pa.StringArray.from_buffers(
+ len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
+ assert copied.to_pylist() == ["b", "c"]
+ assert copied.null_count == 0
+
+
+def test_list_from_buffers():
+ ty = pa.list_(pa.int16())
+ array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty)
+
+ buffers = array.buffers()
+
+ with pytest.raises(ValueError):
+ # No children
+ pa.Array.from_buffers(ty, 4, [None, buffers[1]])
+
+ child = pa.Array.from_buffers(pa.int16(), 6, buffers[2:])
+ copied = pa.Array.from_buffers(ty, 4, buffers[:2], children=[child])
+ assert copied.equals(array)
+
+ with pytest.raises(ValueError):
+ # too many children
+ pa.Array.from_buffers(ty, 4, [None, buffers[1]],
+ children=[child, child])
+
+
+def test_struct_from_buffers():
+ ty = pa.struct([pa.field('a', pa.int16()), pa.field('b', pa.utf8())])
+ array = pa.array([{'a': 0, 'b': 'foo'}, None, {'a': 5, 'b': ''}],
+ type=ty)
+ buffers = array.buffers()
+
+ with pytest.raises(ValueError):
+ # No children
+ pa.Array.from_buffers(ty, 3, [None, buffers[1]])
+
+ children = [pa.Array.from_buffers(pa.int16(), 3, buffers[1:3]),
+ pa.Array.from_buffers(pa.utf8(), 3, buffers[3:])]
+ copied = pa.Array.from_buffers(ty, 3, buffers[:1], children=children)
+ assert copied.equals(array)
+
+ with pytest.raises(ValueError):
+ # not enough many children
+ pa.Array.from_buffers(ty, 3, [buffers[0]],
+ children=children[:1])
def test_dictionary_from_numpy():
@@ -499,36 +572,6 @@ def test_union_array_slice():
assert arr[i:j].to_pylist() == lst[i:j]
-def test_string_from_buffers():
- array = pa.array(["a", None, "b", "c"])
-
- buffers = array.buffers()
- copied = pa.StringArray.from_buffers(
- len(array), buffers[1], buffers[2], buffers[0], array.null_count,
- array.offset)
- assert copied.to_pylist() == ["a", None, "b", "c"]
-
- copied = pa.StringArray.from_buffers(
- len(array), buffers[1], buffers[2], buffers[0])
- assert copied.to_pylist() == ["a", None, "b", "c"]
-
- sliced = array[1:]
- buffers = sliced.buffers()
- copied = pa.StringArray.from_buffers(
- len(sliced), buffers[1], buffers[2], buffers[0], -1, sliced.offset)
- assert copied.to_pylist() == [None, "b", "c"]
- assert copied.null_count == 1
-
- # Slice but exclude all null entries so that we don't need to pass
- # the null bitmap.
- sliced = array[2:]
- buffers = sliced.buffers()
- copied = pa.StringArray.from_buffers(
- len(sliced), buffers[1], buffers[2], None, -1, sliced.offset)
- assert copied.to_pylist() == ["b", "c"]
- assert copied.null_count == 0
-
-
def _check_cast_case(case, safe=True):
in_data, in_type, out_data, out_type = case
if isinstance(out_data, pa.Array):
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 24feec7..9a92761 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -124,6 +124,21 @@ cdef class DataType:
raise ValueError("Non-fixed width type")
return ty.bit_width()
+ @property
+ def num_children(self):
+ """
+ The number of child fields.
+ """
+ return self.type.num_children()
+
+ @property
+ def num_buffers(self):
+ """
+ Number of data buffers required to construct Array type
+ excluding children
+ """
+ return self.type.layout().bit_widths.size()
+
def __str__(self):
return frombytes(self.type.ToString())
@@ -297,13 +312,6 @@ cdef class StructType(DataType):
def __reduce__(self):
return struct, (list(self),)
- @property
- def num_children(self):
- """
- The number of struct fields.
- """
- return self.type.num_children()
-
cdef class UnionType(DataType):
"""
@@ -314,13 +322,6 @@ cdef class UnionType(DataType):
DataType.init(self, type)
@property
- def num_children(self):
- """
- The number of union members.
- """
- return self.type.num_children()
-
- @property
def mode(self):
"""
The mode of the union ("dense" or "sparse").