You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/17 13:56:58 UTC
arrow git commit: ARROW-707: [Python] Return NullArray for array of
all None in Array.from_pandas. Revert from_numpy -> from_pandas
Repository: arrow
Updated Branches:
refs/heads/master f51259068 -> 312a66535
ARROW-707: [Python] Return NullArray for array of all None in Array.from_pandas. Revert from_numpy -> from_pandas
Per ARROW-838, I reverted the `Array.from_numpy` name to `Array.from_pandas` to reflect that the import is specific to pandas 0.x's memory representation.
Author: Wes McKinney <we...@twosigma.com>
Closes #554 from wesm/ARROW-707 and squashes the following commits:
a875257 [Wes McKinney] Rename PyObject_is_null to reflect domain-specific nature
093b057 [Wes McKinney] Check more cases of all nulls. Fix segfault for NaN that resulted from computations
7d97f28 [Wes McKinney] Return NullArray for array of all None in Array.from_pandas. Revert from_numpy -> from_pandas
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/312a6653
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/312a6653
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/312a6653
Branch: refs/heads/master
Commit: 312a665353c420452e98b6b266a5a7cb214c936f
Parents: f512590
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Apr 17 09:56:53 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Apr 17 09:56:53 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/python/pandas_convert.cc | 31 +++++++++++++--------
python/doc/source/api.rst | 1 +
python/pyarrow/__init__.py | 1 +
python/pyarrow/_array.pxd | 4 +++
python/pyarrow/_array.pyx | 18 ++++++-------
python/pyarrow/_io.pyx | 2 +-
python/pyarrow/_table.pyx | 2 +-
python/pyarrow/tests/test_array.py | 4 +--
python/pyarrow/tests/test_convert_pandas.py | 34 ++++++++++++++++--------
python/pyarrow/tests/test_scalars.py | 6 ++---
10 files changed, 65 insertions(+), 38 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index b33aea4..5cdcb6f 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -61,8 +61,16 @@ namespace py {
// ----------------------------------------------------------------------
// Utility code
-static inline bool PyObject_is_null(const PyObject* obj) {
- return obj == Py_None || obj == numpy_nan;
+static inline bool PyFloat_isnan(const PyObject* obj) {
+ if (PyFloat_Check(obj)) {
+ double val = PyFloat_AS_DOUBLE(obj);
+ return val != val;
+ } else {
+ return false;
+ }
+}
+static inline bool PandasObjectIsNull(const PyObject* obj) {
+ return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj);
}
static inline bool PyObject_is_string(const PyObject* obj) {
@@ -158,7 +166,7 @@ static Status AppendObjectStrings(
for (int64_t i = 0; i < objects.size(); ++i) {
obj = objects[i];
- if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+ if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
} else if (PyUnicode_Check(obj)) {
obj = PyUnicode_AsUTF8String(obj);
@@ -197,7 +205,7 @@ static Status AppendObjectFixedWidthBytes(PyArrayObject* arr, PyArrayObject* mas
for (int64_t i = 0; i < objects.size(); ++i) {
obj = objects[i];
- if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+ if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
RETURN_NOT_OK(builder->AppendNull());
} else if (PyUnicode_Check(obj)) {
obj = PyUnicode_AsUTF8String(obj);
@@ -519,7 +527,7 @@ Status PandasConverter::ConvertDates() {
obj = objects[i];
if (PyDate_CheckExact(obj)) {
date_builder.Append(UnboxDate<ArrowType>::Unbox(obj));
- } else if (PyObject_is_null(obj)) {
+ } else if (PandasObjectIsNull(obj)) {
date_builder.AppendNull();
} else {
return InvalidConversion(obj, "date");
@@ -570,7 +578,7 @@ Status PandasConverter::ConvertDecimals() {
default:
break;
}
- } else if (PyObject_is_null(object)) {
+ } else if (PandasObjectIsNull(object)) {
decimal_builder.AppendNull();
} else {
return InvalidConversion(object, "decimal.Decimal");
@@ -724,7 +732,7 @@ Status PandasConverter::ConvertBooleans() {
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
obj = objects[i];
- if ((have_mask && mask_values[i]) || PyObject_is_null(obj)) {
+ if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
++null_count;
} else if (obj == Py_True) {
BitUtil::SetBit(bitmap, i);
@@ -791,7 +799,7 @@ Status PandasConverter::ConvertObjects() {
RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
for (int64_t i = 0; i < length_; ++i) {
- if (PyObject_is_null(objects[i])) {
+ if (PandasObjectIsNull(objects[i])) {
continue;
} else if (PyObject_is_string(objects[i])) {
return ConvertObjectStrings();
@@ -809,7 +817,8 @@ Status PandasConverter::ConvertObjects() {
}
}
- return Status::TypeError("Unable to infer type of object array, were all null");
+ out_ = std::make_shared<NullArray>(length_);
+ return Status::OK();
}
template <int ITEM_TYPE, typename ArrowType>
@@ -833,7 +842,7 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
ListBuilder list_builder(pool_, value_builder);
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
for (int64_t i = 0; i < length_; ++i) {
- if (PyObject_is_null(objects[i])) {
+ if (PandasObjectIsNull(objects[i])) {
RETURN_NOT_OK(list_builder.AppendNull());
} else if (PyArray_Check(objects[i])) {
auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
@@ -893,7 +902,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
ListBuilder list_builder(pool_, value_builder);
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
for (int64_t i = 0; i < length_; ++i) {
- if (PyObject_is_null(objects[i])) {
+ if (PandasObjectIsNull(objects[i])) {
RETURN_NOT_OK(list_builder.AppendNull());
} else if (PyArray_Check(objects[i])) {
auto numpy_array = reinterpret_cast<PyArrayObject*>(objects[i]);
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/doc/source/api.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index 801ab34..1b7b9bd 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -90,6 +90,7 @@ Array Types
:toctree: generated/
Array
+ NullArray
NumericArray
IntegerArray
FloatingPointArray
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 506d567..3db2a4f 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -40,6 +40,7 @@ from pyarrow._array import (null, bool_,
Array, Tensor,
from_pylist,
from_numpy_dtype,
+ NullArray,
NumericArray, IntegerArray, FloatingPointArray,
BooleanArray,
Int8Array, UInt8Array,
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd
index 4041374..afb0c27 100644
--- a/python/pyarrow/_array.pxd
+++ b/python/pyarrow/_array.pxd
@@ -141,6 +141,10 @@ cdef class Tensor:
cdef init(self, const shared_ptr[CTensor]& sp_tensor)
+cdef class NullArray(Array):
+ pass
+
+
cdef class BooleanArray(Array):
pass
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx
index c5a595c..99ff6f2 100644
--- a/python/pyarrow/_array.pyx
+++ b/python/pyarrow/_array.pyx
@@ -843,9 +843,9 @@ cdef class Array:
self.type = box_data_type(self.sp_array.get().type())
@staticmethod
- def from_numpy(obj, mask=None, DataType type=None,
- timestamps_to_ms=False,
- MemoryPool memory_pool=None):
+ def from_pandas(obj, mask=None, DataType type=None,
+ timestamps_to_ms=False,
+ MemoryPool memory_pool=None):
"""
Convert pandas.Series to an Arrow Array.
@@ -878,7 +878,7 @@ cdef class Array:
>>> import pandas as pd
>>> import pyarrow as pa
- >>> pa.Array.from_numpy(pd.Series([1, 2]))
+ >>> pa.Array.from_pandas(pd.Series([1, 2]))
<pyarrow.array.Int64Array object at 0x7f674e4c0e10>
[
1,
@@ -886,7 +886,7 @@ cdef class Array:
]
>>> import numpy as np
- >>> pa.Array.from_numpy(pd.Series([1, 2]), np.array([0, 1],
+ >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1],
... dtype=bool))
<pyarrow.array.Int64Array object at 0x7f9019e11208>
[
@@ -1329,14 +1329,14 @@ cdef class DictionaryArray(Array):
mask = indices == -1
else:
mask = mask | (indices == -1)
- arrow_indices = Array.from_numpy(indices, mask=mask,
- memory_pool=memory_pool)
+ arrow_indices = Array.from_pandas(indices, mask=mask,
+ memory_pool=memory_pool)
if isinstance(dictionary, Array):
arrow_dictionary = dictionary
else:
- arrow_dictionary = Array.from_numpy(dictionary,
- memory_pool=memory_pool)
+ arrow_dictionary = Array.from_pandas(dictionary,
+ memory_pool=memory_pool)
if not isinstance(arrow_indices, IntegerArray):
raise ValueError('Indices must be integer type')
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_io.pyx b/python/pyarrow/_io.pyx
index 9f067fb..ec37de0 100644
--- a/python/pyarrow/_io.pyx
+++ b/python/pyarrow/_io.pyx
@@ -1148,7 +1148,7 @@ cdef class FeatherWriter:
if isinstance(col, Array):
arr = col
else:
- arr = Array.from_numpy(col, mask=mask)
+ arr = Array.from_pandas(col, mask=mask)
cdef c_string c_name = tobytes(name)
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/_table.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/_table.pyx
index 6558b2e..78fec75 100644
--- a/python/pyarrow/_table.pyx
+++ b/python/pyarrow/_table.pyx
@@ -321,7 +321,7 @@ cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema):
if schema is not None:
type = schema.field_by_name(name).type
- arr = Array.from_numpy(col, type=type,
+ arr = Array.from_pandas(col, type=type,
timestamps_to_ms=timestamps_to_ms)
names.append(name)
arrays.append(arr)
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 57b17f6..a1fe842 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -162,8 +162,8 @@ def test_dictionary_from_boxed_arrays():
indices = np.repeat([0, 1, 2], 2)
dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
- iarr = pa.Array.from_numpy(indices)
- darr = pa.Array.from_numpy(dictionary)
+ iarr = pa.Array.from_pandas(indices)
+ darr = pa.Array.from_pandas(dictionary)
d1 = pa.DictionaryArray.from_arrays(iarr, darr)
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 2394d63..f360234 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -79,8 +79,8 @@ class TestPandasConversion(unittest.TestCase):
def _check_array_roundtrip(self, values, expected=None, mask=None,
timestamps_to_ms=False, type=None):
- arr = pa.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
- mask=mask, type=type)
+ arr = pa.Array.from_pandas(values, timestamps_to_ms=timestamps_to_ms,
+ mask=mask, type=type)
result = arr.to_pandas()
values_nulls = pd.isnull(values)
@@ -125,7 +125,7 @@ class TestPandasConversion(unittest.TestCase):
for name, arrow_dtype in dtypes:
values = np.random.randn(num_values).astype(name)
- arr = pa.Array.from_numpy(values, null_mask)
+ arr = pa.Array.from_pandas(values, null_mask)
arrays.append(arr)
fields.append(pa.Field.from_py(name, arrow_dtype))
values[null_mask] = np.nan
@@ -178,7 +178,7 @@ class TestPandasConversion(unittest.TestCase):
for name in int_dtypes:
values = np.random.randint(0, 100, size=num_values)
- arr = pa.Array.from_numpy(values, null_mask)
+ arr = pa.Array.from_pandas(values, null_mask)
arrays.append(arr)
expected = values.astype('f8')
@@ -212,7 +212,7 @@ class TestPandasConversion(unittest.TestCase):
mask = np.random.randint(0, 10, size=num_values) < 3
values = np.random.randint(0, 10, size=num_values) < 5
- arr = pa.Array.from_numpy(values, mask)
+ arr = pa.Array.from_pandas(values, mask)
expected = values.astype(object)
expected[mask] = None
@@ -375,11 +375,11 @@ class TestPandasConversion(unittest.TestCase):
t32 = pa.date32()
t64 = pa.date64()
- a32 = pa.Array.from_numpy(arr, type=t32)
- a64 = pa.Array.from_numpy(arr, type=t64)
+ a32 = pa.Array.from_pandas(arr, type=t32)
+ a64 = pa.Array.from_pandas(arr, type=t64)
- a32_expected = pa.Array.from_numpy(arr_i4, mask=mask, type=t32)
- a64_expected = pa.Array.from_numpy(arr_i8, mask=mask, type=t64)
+ a32_expected = pa.Array.from_pandas(arr_i4, mask=mask, type=t32)
+ a64_expected = pa.Array.from_pandas(arr_i8, mask=mask, type=t64)
assert a32.equals(a32_expected)
assert a64.equals(a64_expected)
@@ -406,8 +406,8 @@ class TestPandasConversion(unittest.TestCase):
arr = np.array([17259, 17260, 17261], dtype='int32')
arr2 = arr.astype('int64') * 86400000
- a1 = pa.Array.from_numpy(arr, type=t1)
- a2 = pa.Array.from_numpy(arr2, type=t2)
+ a1 = pa.Array.from_pandas(arr, type=t1)
+ a2 = pa.Array.from_pandas(arr2, type=t2)
expected = datetime.date(2017, 4, 3)
assert a1[0].as_py() == expected
@@ -586,3 +586,15 @@ class TestPandasConversion(unittest.TestCase):
converted = pa.Table.from_pandas(expected)
df = converted.to_pandas()
tm.assert_frame_equal(df, expected)
+
+ def test_all_nones(self):
+ def _check_series(s):
+ converted = pa.Array.from_pandas(s)
+ assert isinstance(converted, pa.NullArray)
+ assert len(converted) == 3
+ assert converted.null_count == 3
+ assert converted[0] is pa.NA
+
+ _check_series(pd.Series([None] * 3, dtype=object))
+ _check_series(pd.Series([np.nan] * 3, dtype=object))
+ _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object))
http://git-wip-us.apache.org/repos/asf/arrow/blob/312a6653/python/pyarrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index f4f275b..df2a898 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -124,7 +124,7 @@ class TestScalars(unittest.TestCase):
for unit in units:
dtype = 'datetime64[{0}]'.format(unit)
- arrow_arr = pa.Array.from_numpy(arr.astype(dtype))
+ arrow_arr = pa.Array.from_pandas(arr.astype(dtype))
expected = pd.Timestamp('2000-01-01 12:34:56')
assert arrow_arr[0].as_py() == expected
@@ -133,8 +133,8 @@ class TestScalars(unittest.TestCase):
arrow_type = pa.timestamp(unit, tz=tz)
dtype = 'datetime64[{0}]'.format(unit)
- arrow_arr = pa.Array.from_numpy(arr.astype(dtype),
- type=arrow_type)
+ arrow_arr = pa.Array.from_pandas(arr.astype(dtype),
+ type=arrow_type)
expected = (pd.Timestamp('2000-01-01 12:34:56')
.tz_localize('utc')
.tz_convert(tz))