You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/04 20:42:09 UTC
[arrow] branch master updated: ARROW-2666: [Python] Add __array__ method to Array, ChunkedArray, Column
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7a6144e ARROW-2666: [Python] Add __array__ method to Array, ChunkedArray, Column
7a6144e is described below
commit 7a6144ec67b7b61130b937c806598bcadf46fc11
Author: Pedro M. Duarte <pm...@gmail.com>
AuthorDate: Sat Aug 4 16:00:40 2018 -0400
ARROW-2666: [Python] Add __array__ method to Array, ChunkedArray, Column
Implement `__array__` method on `pyarrow.Array`, `pyarrow.ChunkedArray` and `pyarrow.Column` so that the `to_pandas()` method is used when calling `numpy.asarray` on an instance of these classes.
Without this method, `numpy.asarray` falls back to the iterator interface, so the result is a NumPy object array whose elements are pyarrow scalar values (e.g. `Int64Value`) rather than an array with a native NumPy dtype.
Author: Pedro M. Duarte <pm...@gmail.com>
Closes #2365 from PedroMDuarte/asarray and squashes the following commits:
71f9e291 <Pedro M. Duarte> Improve inline comment
6eac2685 <Pedro M. Duarte> Add __array__ method to Array, ChunkedArray, Column
---
python/pyarrow/array.pxi | 5 ++++
python/pyarrow/table.pxi | 17 +++++++++---
python/pyarrow/tests/test_array.py | 29 ++++++++++++++++++++
python/pyarrow/tests/test_table.py | 56 +++++++++++++++++++++++++++++++++++---
4 files changed, 99 insertions(+), 8 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index d59bb05..513fa86 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -652,6 +652,11 @@ cdef class Array:
self, &out))
return wrap_array_output(out)
+ def __array__(self, dtype=None):
+ if dtype is None:
+ return self.to_pandas()
+ return self.to_pandas().astype(dtype)
+
def to_numpy(self):
"""
EXPERIMENTAL: Construct a NumPy view of this array. Only supports
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9a8a875..e056843 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -147,11 +147,12 @@ cdef class ChunkedArray:
c_bool zero_copy_only=False,
c_bool integer_object_nulls=False):
"""
- Convert the arrow::Column to a pandas.Series
+ Convert the arrow::ChunkedArray to an array object suitable for use
+ in pandas
- Returns
- -------
- pandas.Series
+ See also
+ --------
+ Column.to_pandas
"""
cdef:
PyObject* out
@@ -171,6 +172,11 @@ cdef class ChunkedArray:
return wrap_array_output(out)
+ def __array__(self, dtype=None):
+ if dtype is None:
+ return self.to_pandas()
+ return self.to_pandas().astype(dtype)
+
def dictionary_encode(self):
"""
Compute dictionary-encoded representation of array
@@ -517,6 +523,9 @@ cdef class Column:
return result
+ def __array__(self, dtype=None):
+ return self.data.__array__(dtype=dtype)
+
def equals(self, Column other):
"""
Check if contents of two columns are equal
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index af2708f..425fe09 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -156,6 +156,35 @@ def test_to_pandas_zero_copy():
np_arr.sum()
+def test_asarray():
+ arr = pa.array(range(4))
+
+ # The iterator interface gives back an array of Int64Value's
+ np_arr = np.asarray([_ for _ in arr])
+ assert np_arr.tolist() == [0, 1, 2, 3]
+ assert np_arr.dtype == np.dtype('O')
+ assert type(np_arr[0]) == pa.lib.Int64Value
+
+ # Calling with the arrow array gives back an array with 'int64' dtype
+ np_arr = np.asarray(arr)
+ assert np_arr.tolist() == [0, 1, 2, 3]
+ assert np_arr.dtype == np.dtype('int64')
+
+ # An optional type can be specified when calling np.asarray
+ np_arr = np.asarray(arr, dtype='str')
+ assert np_arr.tolist() == ['0', '1', '2', '3']
+
+ # If PyArrow array has null values, numpy type will be changed as needed
+ # to support nulls.
+ arr = pa.array([0, 1, 2, None])
+ assert arr.type == pa.int64()
+ np_arr = np.asarray(arr)
+ elements = np_arr.tolist()
+ assert elements[:3] == [0., 1., 2.]
+ assert np.isnan(elements[3])
+ assert np_arr.dtype == np.dtype('float64')
+
+
def test_array_getitem():
arr = pa.array(range(10, 15))
lst = arr.to_pylist()
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 69086e0..cc672fc 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -160,6 +160,48 @@ def test_chunked_array_pickle(data, typ):
assert result.equals(array)
+def test_chunked_array_to_pandas():
+ data = [
+ pa.array([-10, -5, 0, 5, 10])
+ ]
+ table = pa.Table.from_arrays(data, names=['a'])
+ chunked_arr = table.column(0).data
+ assert isinstance(chunked_arr, pa.ChunkedArray)
+ array = chunked_arr.to_pandas()
+ assert array.shape == (5,)
+ assert array[0] == -10
+
+
+def test_chunked_array_asarray():
+ data = [
+ pa.array([0]),
+ pa.array([1, 2, 3])
+ ]
+ chunked_arr = pa.chunked_array(data)
+
+ np_arr = np.asarray(chunked_arr)
+ assert np_arr.tolist() == [0, 1, 2, 3]
+ assert np_arr.dtype == np.dtype('int64')
+
+ # An optional type can be specified when calling np.asarray
+ np_arr = np.asarray(chunked_arr, dtype='str')
+ assert np_arr.tolist() == ['0', '1', '2', '3']
+
+ # Types are modified when there are nulls
+ data = [
+ pa.array([1, None]),
+ pa.array([1, 2, 3])
+ ]
+ chunked_arr = pa.chunked_array(data)
+
+ np_arr = np.asarray(chunked_arr)
+ elements = np_arr.tolist()
+ assert elements[0] == 1.
+ assert np.isnan(elements[1])
+ assert elements[2:] == [1., 2., 3.]
+ assert np_arr.dtype == np.dtype('float64')
+
+
def test_column_basics():
data = [
pa.array([-10, -5, 0, 5, 10])
@@ -219,14 +261,20 @@ def test_column_to_pandas():
assert series.iloc[0] == -10
-def test_chunked_array_to_pandas():
+def test_column_asarray():
data = [
pa.array([-10, -5, 0, 5, 10])
]
table = pa.Table.from_arrays(data, names=['a'])
- array = table.column(0).data.to_pandas()
- assert array.shape == (5,)
- assert array[0] == -10
+ column = table.column(0)
+
+ np_arr = np.asarray(column)
+ assert np_arr.tolist() == [-10, -5, 0, 5, 10]
+ assert np_arr.dtype == np.dtype('int64')
+
+ # An optional type can be specified when calling np.asarray
+ np_arr = np.asarray(column, dtype='str')
+ assert np_arr.tolist() == ['-10', '-5', '0', '5', '10']
def test_column_flatten():