You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/04 20:42:09 UTC

[arrow] branch master updated: ARROW-2666: [Python] Add __array__ method to Array, ChunkedArray, Column

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7a6144e  ARROW-2666: [Python] Add __array__ method to Array, ChunkedArray, Column
7a6144e is described below

commit 7a6144ec67b7b61130b937c806598bcadf46fc11
Author: Pedro M. Duarte <pm...@gmail.com>
AuthorDate: Sat Aug 4 16:00:40 2018 -0400

    ARROW-2666: [Python] Add __array__ method to Array, ChunkedArray, Column
    
    Implement the `__array__` method on `pyarrow.Array`, `pyarrow.ChunkedArray` and `pyarrow.Column` so that `numpy.asarray` uses the `to_pandas()` method when called on an instance of these classes.
    
    Currently, `numpy.asarray` falls back to the iterator interface, so we get NumPy object arrays whose elements are the underlying pyarrow scalar values (e.g. `Int64Value`) rather than a native NumPy dtype.
    
    Author: Pedro M. Duarte <pm...@gmail.com>
    
    Closes #2365 from PedroMDuarte/asarray and squashes the following commits:
    
    71f9e291 <Pedro M. Duarte> Improve inline comment
    6eac2685 <Pedro M. Duarte> Add __array__ method to Array, ChunkedArray, Column
---
 python/pyarrow/array.pxi           |  5 ++++
 python/pyarrow/table.pxi           | 17 +++++++++---
 python/pyarrow/tests/test_array.py | 29 ++++++++++++++++++++
 python/pyarrow/tests/test_table.py | 56 +++++++++++++++++++++++++++++++++++---
 4 files changed, 99 insertions(+), 8 deletions(-)

diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index d59bb05..513fa86 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -652,6 +652,11 @@ cdef class Array:
                                               self, &out))
         return wrap_array_output(out)
 
+    def __array__(self, dtype=None):
+        if dtype is None:
+            return self.to_pandas()
+        return self.to_pandas().astype(dtype)
+
     def to_numpy(self):
         """
         EXPERIMENTAL: Construct a NumPy view of this array. Only supports
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9a8a875..e056843 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -147,11 +147,12 @@ cdef class ChunkedArray:
                   c_bool zero_copy_only=False,
                   c_bool integer_object_nulls=False):
         """
-        Convert the arrow::Column to a pandas.Series
+        Convert the arrow::ChunkedArray to an array object suitable for use
+        in pandas
 
-        Returns
-        -------
-        pandas.Series
+        See also
+        --------
+        Column.to_pandas
         """
         cdef:
             PyObject* out
@@ -171,6 +172,11 @@ cdef class ChunkedArray:
 
         return wrap_array_output(out)
 
+    def __array__(self, dtype=None):
+        if dtype is None:
+            return self.to_pandas()
+        return self.to_pandas().astype(dtype)
+
     def dictionary_encode(self):
         """
         Compute dictionary-encoded representation of array
@@ -517,6 +523,9 @@ cdef class Column:
 
         return result
 
+    def __array__(self, dtype=None):
+        return self.data.__array__(dtype=dtype)
+
     def equals(self, Column other):
         """
         Check if contents of two columns are equal
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index af2708f..425fe09 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -156,6 +156,35 @@ def test_to_pandas_zero_copy():
         np_arr.sum()
 
 
+def test_asarray():
+    arr = pa.array(range(4))
+
+    # The iterator interface gives back an array of Int64Value's
+    np_arr = np.asarray([_ for _ in arr])
+    assert np_arr.tolist() == [0, 1, 2, 3]
+    assert np_arr.dtype == np.dtype('O')
+    assert type(np_arr[0]) == pa.lib.Int64Value
+
+    # Calling with the arrow array gives back an array with 'int64' dtype
+    np_arr = np.asarray(arr)
+    assert np_arr.tolist() == [0, 1, 2, 3]
+    assert np_arr.dtype == np.dtype('int64')
+
+    # An optional type can be specified when calling np.asarray
+    np_arr = np.asarray(arr, dtype='str')
+    assert np_arr.tolist() == ['0', '1', '2', '3']
+
+    # If PyArrow array has null values, numpy type will be changed as needed
+    # to support nulls.
+    arr = pa.array([0, 1, 2, None])
+    assert arr.type == pa.int64()
+    np_arr = np.asarray(arr)
+    elements = np_arr.tolist()
+    assert elements[:3] == [0., 1., 2.]
+    assert np.isnan(elements[3])
+    assert np_arr.dtype == np.dtype('float64')
+
+
 def test_array_getitem():
     arr = pa.array(range(10, 15))
     lst = arr.to_pylist()
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 69086e0..cc672fc 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -160,6 +160,48 @@ def test_chunked_array_pickle(data, typ):
     assert result.equals(array)
 
 
+def test_chunked_array_to_pandas():
+    data = [
+        pa.array([-10, -5, 0, 5, 10])
+    ]
+    table = pa.Table.from_arrays(data, names=['a'])
+    chunked_arr = table.column(0).data
+    assert isinstance(chunked_arr, pa.ChunkedArray)
+    array = chunked_arr.to_pandas()
+    assert array.shape == (5,)
+    assert array[0] == -10
+
+
+def test_chunked_array_asarray():
+    data = [
+        pa.array([0]),
+        pa.array([1, 2, 3])
+    ]
+    chunked_arr = pa.chunked_array(data)
+
+    np_arr = np.asarray(chunked_arr)
+    assert np_arr.tolist() == [0, 1, 2, 3]
+    assert np_arr.dtype == np.dtype('int64')
+
+    # An optional type can be specified when calling np.asarray
+    np_arr = np.asarray(chunked_arr, dtype='str')
+    assert np_arr.tolist() == ['0', '1', '2', '3']
+
+    # Types are modified when there are nulls
+    data = [
+        pa.array([1, None]),
+        pa.array([1, 2, 3])
+    ]
+    chunked_arr = pa.chunked_array(data)
+
+    np_arr = np.asarray(chunked_arr)
+    elements = np_arr.tolist()
+    assert elements[0] == 1.
+    assert np.isnan(elements[1])
+    assert elements[2:] == [1., 2., 3.]
+    assert np_arr.dtype == np.dtype('float64')
+
+
 def test_column_basics():
     data = [
         pa.array([-10, -5, 0, 5, 10])
@@ -219,14 +261,20 @@ def test_column_to_pandas():
     assert series.iloc[0] == -10
 
 
-def test_chunked_array_to_pandas():
+def test_column_asarray():
     data = [
         pa.array([-10, -5, 0, 5, 10])
     ]
     table = pa.Table.from_arrays(data, names=['a'])
-    array = table.column(0).data.to_pandas()
-    assert array.shape == (5,)
-    assert array[0] == -10
+    column = table.column(0)
+
+    np_arr = np.asarray(column)
+    assert np_arr.tolist() == [-10, -5, 0, 5, 10]
+    assert np_arr.dtype == np.dtype('int64')
+
+    # An optional type can be specified when calling np.asarray
+    np_arr = np.asarray(column, dtype='str')
+    assert np_arr.tolist() == ['-10', '-5', '0', '5', '10']
 
 
 def test_column_flatten():