You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/08/10 12:58:46 UTC
[arrow] branch master updated: ARROW-9429: [Python]
ChunkedArray.to_numpy
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 52d0fe6 ARROW-9429: [Python] ChunkedArray.to_numpy
52d0fe6 is described below
commit 52d0fe618d06df8018c6af96e09e0c4ee538e724
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Mon Aug 10 14:57:54 2020 +0200
ARROW-9429: [Python] ChunkedArray.to_numpy
While it technically still uses pandas during the conversion, it exposes the to_numpy() method.
Also refactored the chunked array construction to support more flexible input, see the test case.
Closes #7868 from kszucs/ARROW-9429
Lead-authored-by: Krisztián Szűcs <sz...@gmail.com>
Co-authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
python/pyarrow/table.pxi | 68 ++++++++++++++++----------
python/pyarrow/tests/test_extension_type.py | 21 ++++++--
python/pyarrow/tests/test_table.py | 74 ++++++++++++++++++++++++++++-
3 files changed, 132 insertions(+), 31 deletions(-)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 37064a5..b8205a3 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -226,24 +226,35 @@ cdef class ChunkedArray(_PandasConvertible):
def _to_pandas(self, options, **kwargs):
return _array_like_to_pandas(self, options)
- def __array__(self, dtype=None):
+ def to_numpy(self):
+ """
+ Return a NumPy copy of this array (experimental).
+
+ Returns
+ -------
+ array : numpy.ndarray
+ """
cdef:
PyObject* out
PandasOptions c_options
object values
if self.type.id == _Type_EXTENSION:
- return (
- chunked_array(
- [self.chunk(i).storage for i in range(self.num_chunks)]
- ).__array__(dtype)
+ storage_array = chunked_array(
+ [chunk.storage for chunk in self.iterchunks()],
+ type=self.type.storage_type
)
+ return storage_array.to_numpy()
with nogil:
- check_status(libarrow.ConvertChunkedArrayToPandas(
- c_options,
- self.sp_chunked_array,
- self, &out))
+ check_status(
+ ConvertChunkedArrayToPandas(
+ c_options,
+ self.sp_chunked_array,
+ self,
+ &out
+ )
+ )
# wrap_array_output uses pandas to convert to Categorical, here
# always convert to numpy array
@@ -252,6 +263,10 @@ cdef class ChunkedArray(_PandasConvertible):
if isinstance(values, dict):
values = np.take(values['dictionary'], values['indices'])
+ return values
+
+ def __array__(self, dtype=None):
+ values = self.to_numpy()
if dtype is None:
return values
return values.astype(dtype)
@@ -416,7 +431,6 @@ def chunked_array(arrays, type=None):
Must all be the same data type. Can be empty only if type also passed.
type : DataType or string coercible to DataType
-
Returns
-------
ChunkedArray
@@ -425,31 +439,35 @@ def chunked_array(arrays, type=None):
Array arr
vector[shared_ptr[CArray]] c_arrays
shared_ptr[CChunkedArray] sp_chunked_array
- shared_ptr[CDataType] sp_data_type
+
+ type = ensure_type(type, allow_none=True)
if isinstance(arrays, Array):
arrays = [arrays]
for x in arrays:
- if isinstance(x, Array):
- arr = x
- if type is not None:
- assert x.type == type
+ arr = x if isinstance(x, Array) else array(x, type=type)
+
+ if type is None:
+ # it allows more flexible chunked array construction, coercing
+ # subsequent arrays to the first inferred array type
+ # it also spares the inference overhead after the first chunk
+ type = arr.type
else:
- arr = array(x, type=type)
+ if arr.type != type:
+ raise TypeError(
+ "All array chunks must have type {}".format(type)
+ )
c_arrays.push_back(arr.sp_array)
- if type:
- type = ensure_type(type)
- sp_data_type = pyarrow_unwrap_data_type(type)
- sp_chunked_array.reset(new CChunkedArray(c_arrays, sp_data_type))
- else:
- if c_arrays.size() == 0:
- raise ValueError("When passing an empty collection of arrays "
- "you must also pass the data type")
- sp_chunked_array.reset(new CChunkedArray(c_arrays))
+ if c_arrays.size() == 0 and type is None:
+ raise ValueError("When passing an empty collection of arrays "
+ "you must also pass the data type")
+ sp_chunked_array.reset(
+ new CChunkedArray(c_arrays, pyarrow_unwrap_data_type(type))
+ )
with nogil:
check_status(sp_chunked_array.get().Validate())
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index dafa4f0..a3ef336 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -482,7 +482,20 @@ def test_to_numpy():
np.testing.assert_array_equal(result, expected)
# chunked array
- charr = pa.chunked_array([arr])
-
- result = np.asarray(charr)
- np.testing.assert_array_equal(result, expected)
+ a1 = pa.chunked_array([arr, arr])
+ a2 = pa.chunked_array([arr, arr], type=period_type)
+ expected = np.hstack([expected, expected])
+
+ for charr in [a1, a2]:
+ assert charr.type == period_type
+ for result in [np.asarray(charr), charr.to_numpy()]:
+ assert result.dtype == np.int64
+ np.testing.assert_array_equal(result, expected)
+
+ # zero chunks
+ charr = pa.chunked_array([], type=period_type)
+ assert charr.type == period_type
+
+ for result in [np.asarray(charr), charr.to_numpy()]:
+ assert result.dtype == np.int64
+ np.testing.assert_array_equal(result, np.array([], dtype='int64'))
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index af32e18..4012a91 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -57,9 +57,79 @@ def test_chunked_array_basics():
assert wr() is None
+def test_chunked_array_construction():
+ arr = pa.chunked_array([
+ [1, 2, 3],
+ [4, 5, 6],
+ [7, 8, 9],
+ ])
+ assert arr.type == pa.int64()
+ assert len(arr) == 9
+ assert len(arr.chunks) == 3
+
+ arr = pa.chunked_array([
+ [1, 2, 3],
+ [4., 5., 6.],
+ [7, 8, 9],
+ ])
+ assert arr.type == pa.int64()
+ assert len(arr) == 9
+ assert len(arr.chunks) == 3
+
+ arr = pa.chunked_array([
+ [1, 2, 3],
+ [4., 5., 6.],
+ [7, 8, 9],
+ ], type=pa.int8())
+ assert arr.type == pa.int8()
+ assert len(arr) == 9
+ assert len(arr.chunks) == 3
+
+ arr = pa.chunked_array([
+ [1, 2, 3],
+ []
+ ])
+ assert arr.type == pa.int64()
+ assert len(arr) == 3
+ assert len(arr.chunks) == 2
+
+ msg = (
+ "When passing an empty collection of arrays you must also pass the "
+ "data type"
+ )
+ with pytest.raises(ValueError, match=msg):
+ assert pa.chunked_array([])
+
+ assert pa.chunked_array([], type=pa.string()).type == pa.string()
+ assert pa.chunked_array([[]]).type == pa.null()
+ assert pa.chunked_array([[]], type=pa.string()).type == pa.string()
+
+
+def test_chunked_array_to_numpy():
+ data = pa.chunked_array([
+ [1, 2, 3],
+ [4, 5, 6],
+ []
+ ])
+ arr1 = np.asarray(data)
+ arr2 = data.to_numpy()
+
+ assert isinstance(arr2, np.ndarray)
+ assert arr2.shape == (6,)
+ assert np.array_equal(arr1, arr2)
+
+
def test_chunked_array_mismatch_types():
- with pytest.raises(pa.ArrowInvalid):
- pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])])
+ with pytest.raises(TypeError):
+ # Given array types are different
+ pa.chunked_array([
+ pa.array([1, 2, 3]),
+ pa.array([1., 2., 3.])
+ ])
+
+ with pytest.raises(TypeError):
+ # Given array type is different from explicit type argument
+ pa.chunked_array([pa.array([1, 2, 3])], type=pa.float64())
def test_chunked_array_str():