You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/08/10 12:58:46 UTC
[arrow] branch master updated: ARROW-9429: [Python] ChunkedArray.to_numpy

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 52d0fe6  ARROW-9429: [Python] ChunkedArray.to_numpy
52d0fe6 is described below

commit 52d0fe618d06df8018c6af96e09e0c4ee538e724
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Mon Aug 10 14:57:54 2020 +0200

    ARROW-9429: [Python] ChunkedArray.to_numpy
    
    While it technically still uses pandas during the conversion, it exposes the to_numpy() method.
    
    Also refactored the chunked array construction to support more flexible input, see the test case.
    
    Closes #7868 from kszucs/ARROW-9429
    
    Lead-authored-by: Krisztián Szűcs <sz...@gmail.com>
    Co-authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 python/pyarrow/table.pxi                    | 68 ++++++++++++++++----------
 python/pyarrow/tests/test_extension_type.py | 21 ++++++--
 python/pyarrow/tests/test_table.py          | 74 ++++++++++++++++++++++++++++-
 3 files changed, 132 insertions(+), 31 deletions(-)

diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 37064a5..b8205a3 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -226,24 +226,35 @@ cdef class ChunkedArray(_PandasConvertible):
     def _to_pandas(self, options, **kwargs):
         return _array_like_to_pandas(self, options)
 
-    def __array__(self, dtype=None):
+    def to_numpy(self):
+        """
+        Return a NumPy copy of this array (experimental).
+
+        Returns
+        -------
+        array : numpy.ndarray
+        """
         cdef:
             PyObject* out
             PandasOptions c_options
             object values
 
         if self.type.id == _Type_EXTENSION:
-            return (
-                chunked_array(
-                    [self.chunk(i).storage for i in range(self.num_chunks)]
-                ).__array__(dtype)
+            storage_array = chunked_array(
+                [chunk.storage for chunk in self.iterchunks()],
+                type=self.type.storage_type
             )
+            return storage_array.to_numpy()
 
         with nogil:
-            check_status(libarrow.ConvertChunkedArrayToPandas(
-                c_options,
-                self.sp_chunked_array,
-                self, &out))
+            check_status(
+                ConvertChunkedArrayToPandas(
+                    c_options,
+                    self.sp_chunked_array,
+                    self,
+                    &out
+                )
+            )
 
         # wrap_array_output uses pandas to convert to Categorical, here
         # always convert to numpy array
@@ -252,6 +263,10 @@ cdef class ChunkedArray(_PandasConvertible):
         if isinstance(values, dict):
             values = np.take(values['dictionary'], values['indices'])
 
+        return values
+
+    def __array__(self, dtype=None):
+        values = self.to_numpy()
         if dtype is None:
             return values
         return values.astype(dtype)
@@ -416,7 +431,6 @@ def chunked_array(arrays, type=None):
         Must all be the same data type. Can be empty only if type also passed.
     type : DataType or string coercible to DataType
 
-
     Returns
     -------
     ChunkedArray
@@ -425,31 +439,35 @@ def chunked_array(arrays, type=None):
         Array arr
         vector[shared_ptr[CArray]] c_arrays
         shared_ptr[CChunkedArray] sp_chunked_array
-        shared_ptr[CDataType] sp_data_type
+
+    type = ensure_type(type, allow_none=True)
 
     if isinstance(arrays, Array):
         arrays = [arrays]
 
     for x in arrays:
-        if isinstance(x, Array):
-            arr = x
-            if type is not None:
-                assert x.type == type
+        arr = x if isinstance(x, Array) else array(x, type=type)
+
+        if type is None:
+            # it allows more flexible chunked array construction by coercing
+            # subsequent arrays to the first inferred array type
+            # it also spares the inference overhead after the first chunk
+            type = arr.type
         else:
-            arr = array(x, type=type)
+            if arr.type != type:
+                raise TypeError(
+                    "All array chunks must have type {}".format(type)
+                )
 
         c_arrays.push_back(arr.sp_array)
 
-    if type:
-        type = ensure_type(type)
-        sp_data_type = pyarrow_unwrap_data_type(type)
-        sp_chunked_array.reset(new CChunkedArray(c_arrays, sp_data_type))
-    else:
-        if c_arrays.size() == 0:
-            raise ValueError("When passing an empty collection of arrays "
-                             "you must also pass the data type")
-        sp_chunked_array.reset(new CChunkedArray(c_arrays))
+    if c_arrays.size() == 0 and type is None:
+        raise ValueError("When passing an empty collection of arrays "
+                         "you must also pass the data type")
 
+    sp_chunked_array.reset(
+        new CChunkedArray(c_arrays, pyarrow_unwrap_data_type(type))
+    )
     with nogil:
         check_status(sp_chunked_array.get().Validate())
 
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index dafa4f0..a3ef336 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -482,7 +482,20 @@ def test_to_numpy():
     np.testing.assert_array_equal(result, expected)
 
     # chunked array
-    charr = pa.chunked_array([arr])
-
-    result = np.asarray(charr)
-    np.testing.assert_array_equal(result, expected)
+    a1 = pa.chunked_array([arr, arr])
+    a2 = pa.chunked_array([arr, arr], type=period_type)
+    expected = np.hstack([expected, expected])
+
+    for charr in [a1, a2]:
+        assert charr.type == period_type
+        for result in [np.asarray(charr), charr.to_numpy()]:
+            assert result.dtype == np.int64
+            np.testing.assert_array_equal(result, expected)
+
+    # zero chunks
+    charr = pa.chunked_array([], type=period_type)
+    assert charr.type == period_type
+
+    for result in [np.asarray(charr), charr.to_numpy()]:
+        assert result.dtype == np.int64
+        np.testing.assert_array_equal(result, np.array([], dtype='int64'))
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index af32e18..4012a91 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -57,9 +57,79 @@ def test_chunked_array_basics():
     assert wr() is None
 
 
+def test_chunked_array_construction():
+    arr = pa.chunked_array([
+        [1, 2, 3],
+        [4, 5, 6],
+        [7, 8, 9],
+    ])
+    assert arr.type == pa.int64()
+    assert len(arr) == 9
+    assert len(arr.chunks) == 3
+
+    arr = pa.chunked_array([
+        [1, 2, 3],
+        [4., 5., 6.],
+        [7, 8, 9],
+    ])
+    assert arr.type == pa.int64()
+    assert len(arr) == 9
+    assert len(arr.chunks) == 3
+
+    arr = pa.chunked_array([
+        [1, 2, 3],
+        [4., 5., 6.],
+        [7, 8, 9],
+    ], type=pa.int8())
+    assert arr.type == pa.int8()
+    assert len(arr) == 9
+    assert len(arr.chunks) == 3
+
+    arr = pa.chunked_array([
+        [1, 2, 3],
+        []
+    ])
+    assert arr.type == pa.int64()
+    assert len(arr) == 3
+    assert len(arr.chunks) == 2
+
+    msg = (
+        "When passing an empty collection of arrays you must also pass the "
+        "data type"
+    )
+    with pytest.raises(ValueError, match=msg):
+        assert pa.chunked_array([])
+
+    assert pa.chunked_array([], type=pa.string()).type == pa.string()
+    assert pa.chunked_array([[]]).type == pa.null()
+    assert pa.chunked_array([[]], type=pa.string()).type == pa.string()
+
+
+def test_chunked_array_to_numpy():
+    data = pa.chunked_array([
+        [1, 2, 3],
+        [4, 5, 6],
+        []
+    ])
+    arr1 = np.asarray(data)
+    arr2 = data.to_numpy()
+
+    assert isinstance(arr2, np.ndarray)
+    assert arr2.shape == (6,)
+    assert np.array_equal(arr1, arr2)
+
+
 def test_chunked_array_mismatch_types():
-    with pytest.raises(pa.ArrowInvalid):
-        pa.chunked_array([pa.array([1, 2]), pa.array(['foo', 'bar'])])
+    with pytest.raises(TypeError):
+        # Given array types are different
+        pa.chunked_array([
+            pa.array([1, 2, 3]),
+            pa.array([1., 2., 3.])
+        ])
+
+    with pytest.raises(TypeError):
+        # Given array type is different from explicit type argument
+        pa.chunked_array([pa.array([1, 2, 3])], type=pa.float64())
 
 
 def test_chunked_array_str():