You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/12 22:14:49 UTC
[arrow] branch master updated: ARROW-4324: [Python] Triage broken
type inference logic in presence of a mix of NumPy dtype-having objects and
other scalar values
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 25b4a468 ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values
25b4a468 is described below
commit 25b4a46805a3be01c83e53a92524d4d7b021c74d
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed Jun 12 17:14:40 2019 -0500
ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values
In investigating the innocuous bug report from ARROW-4324 I stumbled on a pile of hacks and flawed design around type inference.
```
test_list = [np.dtype('int32').type(10), np.dtype('float32').type(0.5)]
test_array = pa.array(test_list)
# Expected
# test_array
# <pyarrow.lib.DoubleArray object at 0x7f009963bf48>
# [
# 10,
# 0.5
# ]
# Got
# test_array
# <pyarrow.lib.Int32Array object at 0x7f009963bf48>
# [
# 10,
# 0
# ]
```
It turns out there are several issues:
* There was a kludge around handling the `numpy.nan` value which is a PyFloat, not a NumPy float64 scalar
* Type inference assumed "NaN is null", which should not be hard coded, so I added a flag to switch between pandas semantics and non-pandas
* Mixing NumPy scalar values and non-NumPy scalars (like our evil friend numpy.nan) caused the output type to be simply incorrect. For example `[np.float16(1.5), 2.5]` would yield `pa.float16()` output type. Yuck
I inserted some hacks to force what I believe to be the correct behavior and fixed a couple of unit tests that actually exhibited buggy behavior before (see within). I don't have time to do the "right thing" right now, which is to more or less rewrite the hot path of `arrow/python/inference.cc`, so at least this gets the unit tests asserting what is correct so that refactoring will be more productive later.
Author: Wes McKinney <we...@apache.org>
Closes #4527 from wesm/ARROW-4324 and squashes the following commits:
e396958b0 <Wes McKinney> Add unit test for passing pandas Series with from_pandas=False
754468a5d <Wes McKinney> Set from_pandas to None by default in pyarrow.array so that user wishes can be respected
e1b839339 <Wes McKinney> Remove outdated unit test, add Python unit test that shows behavior from ARROW-2240 that's been changed
4bc8c8193 <Wes McKinney> Triage type inference logic in presence of a mix of NumPy dtype-having objects and other typed values, pending more serious refactor in ARROW-5564
---
cpp/src/arrow/python/arrow_to_pandas.cc | 14 +--
cpp/src/arrow/python/inference.cc | 134 +++++++++++++++++++--------
cpp/src/arrow/python/inference.h | 6 +-
cpp/src/arrow/python/numpy-internal.h | 9 ++
cpp/src/arrow/python/python-test.cc | 14 ---
cpp/src/arrow/python/python_to_arrow.cc | 2 +-
python/pyarrow/array.pxi | 41 +++++---
python/pyarrow/tests/test_convert_builtin.py | 42 +++++++--
python/pyarrow/tests/test_pandas.py | 7 ++
9 files changed, 182 insertions(+), 87 deletions(-)
diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index d556664..fa35a6e 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -173,19 +173,11 @@ inline void set_numpy_metadata(int type, DataType* datatype, PyArray_Descr* out)
}
}
-static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
- if (type == NPY_DATETIME) {
- // It is not safe to mutate the result of DescrFromType
- return PyArray_DescrNewFromType(type);
- } else {
- return PyArray_DescrFromType(type);
- }
-}
static inline PyObject* NewArray1DFromType(DataType* arrow_type, int type, int64_t length,
void* data) {
npy_intp dims[1] = {length};
- PyArray_Descr* descr = GetSafeNumPyDtype(type);
+ PyArray_Descr* descr = internal::GetSafeNumPyDtype(type);
if (descr == nullptr) {
// Error occurred, trust error state is set
return nullptr;
@@ -244,7 +236,7 @@ class PandasBlock {
Status AllocateNDArray(int npy_type, int ndim = 2) {
PyAcquireGIL lock;
- PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
+ PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
PyObject* block_arr;
if (ndim == 2) {
@@ -1220,7 +1212,7 @@ class CategoricalBlock : public PandasBlock {
PyAcquireGIL lock;
- PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
+ PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
if (descr == nullptr) {
// Error occurred, trust error state is set
return Status::OK();
diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc
index 6cf8bed..4ec4d9d 100644
--- a/cpp/src/arrow/python/inference.cc
+++ b/cpp/src/arrow/python/inference.cc
@@ -42,19 +42,27 @@ namespace py {
#define _NUMPY_UNIFY_NOOP(DTYPE) \
case NPY_##DTYPE: \
- return NOOP;
+ return OK;
#define _NUMPY_UNIFY_PROMOTE(DTYPE) \
case NPY_##DTYPE: \
- return PROMOTE;
-
-// Form a consensus NumPy dtype to use for Arrow conversion for a collection of dtype
-// objects observed one at a time
+ current_type_num_ = dtype; \
+ current_dtype_ = descr; \
+ return OK;
+
+#define _NUMPY_UNIFY_PROMOTE_TO(DTYPE, NEW_TYPE) \
+ case NPY_##DTYPE: \
+ current_type_num_ = NPY_##NEW_TYPE; \
+ current_dtype_ = PyArray_DescrFromType(current_type_num_); \
+ return OK;
+
+// Form a consensus NumPy dtype to use for Arrow conversion for a
+// collection of dtype objects observed one at a time
class NumPyDtypeUnifier {
public:
- enum Action { NOOP, PROMOTE, INVALID };
+ enum Action { OK, INVALID };
- NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(NULLPTR) {}
+ NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {}
Status InvalidMix(int new_dtype) {
return Status::Invalid("Cannot mix NumPy dtypes ",
@@ -97,7 +105,7 @@ class NumPyDtypeUnifier {
_NUMPY_UNIFY_PROMOTE(INT64);
_NUMPY_UNIFY_NOOP(UINT8);
_NUMPY_UNIFY_NOOP(UINT16);
- _NUMPY_UNIFY_PROMOTE(FLOAT32);
+ _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
_NUMPY_UNIFY_PROMOTE(FLOAT64);
default:
return INVALID;
@@ -113,7 +121,7 @@ class NumPyDtypeUnifier {
_NUMPY_UNIFY_NOOP(UINT8);
_NUMPY_UNIFY_NOOP(UINT16);
_NUMPY_UNIFY_NOOP(UINT32);
- _NUMPY_UNIFY_PROMOTE(FLOAT32);
+ _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
_NUMPY_UNIFY_PROMOTE(FLOAT64);
default:
return INVALID;
@@ -149,7 +157,7 @@ class NumPyDtypeUnifier {
_NUMPY_UNIFY_NOOP(UINT8);
_NUMPY_UNIFY_NOOP(UINT16);
_NUMPY_UNIFY_PROMOTE(UINT64);
- _NUMPY_UNIFY_PROMOTE(FLOAT32);
+ _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
_NUMPY_UNIFY_PROMOTE(FLOAT64);
default:
return INVALID;
@@ -161,7 +169,7 @@ class NumPyDtypeUnifier {
_NUMPY_UNIFY_NOOP(UINT8);
_NUMPY_UNIFY_NOOP(UINT16);
_NUMPY_UNIFY_NOOP(UINT32);
- _NUMPY_UNIFY_PROMOTE(FLOAT32);
+ _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
_NUMPY_UNIFY_PROMOTE(FLOAT64);
default:
return INVALID;
@@ -210,12 +218,11 @@ class NumPyDtypeUnifier {
int Observe_DATETIME(PyArray_Descr* dtype_obj) {
// TODO: check that units are all the same
- // current_dtype_ = dtype_obj->type_num;
- return NOOP;
+ return OK;
}
Status Observe(PyArray_Descr* descr) {
- const int dtype = fix_numpy_type_num(descr->type_num);
+ int dtype = fix_numpy_type_num(descr->type_num);
if (current_type_num_ == -1) {
current_dtype_ = descr;
@@ -230,7 +237,7 @@ class NumPyDtypeUnifier {
action = Observe_##DTYPE(descr, dtype); \
break;
- int action = NOOP;
+ int action = OK;
switch (current_type_num_) {
OBSERVE_CASE(BOOL);
OBSERVE_CASE(INT8);
@@ -253,9 +260,6 @@ class NumPyDtypeUnifier {
if (action == INVALID) {
return InvalidMix(dtype);
- } else if (action == PROMOTE) {
- current_type_num_ = dtype;
- current_dtype_ = descr;
}
return Status::OK();
}
@@ -264,6 +268,8 @@ class NumPyDtypeUnifier {
PyArray_Descr* current_dtype() const { return current_dtype_; }
+ int current_type_num() const { return current_type_num_; }
+
private:
int current_type_num_;
PyArray_Descr* current_dtype_;
@@ -278,8 +284,10 @@ class TypeInferrer {
// early with long sequences that may have problems up front
// \param make_unions permit mixed-type data by creating union types (not yet
// implemented)
- explicit TypeInferrer(int64_t validate_interval = 100, bool make_unions = false)
- : validate_interval_(validate_interval),
+ explicit TypeInferrer(bool pandas_null_sentinels = false,
+ int64_t validate_interval = 100, bool make_unions = false)
+ : pandas_null_sentinels_(pandas_null_sentinels),
+ validate_interval_(validate_interval),
make_unions_(make_unions),
total_count_(0),
none_count_(0),
@@ -297,6 +305,7 @@ class TypeInferrer {
decimal_count_(0),
list_count_(0),
struct_count_(0),
+ numpy_dtype_count_(0),
max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
std::numeric_limits<int32_t>::min()),
decimal_type_() {
@@ -311,12 +320,12 @@ class TypeInferrer {
Status Visit(PyObject* obj, bool* keep_going) {
++total_count_;
- if (obj == Py_None || internal::PyFloat_IsNaN(obj)) {
+ if (obj == Py_None || (pandas_null_sentinels_ && internal::PyFloat_IsNaN(obj))) {
++none_count_;
} else if (PyBool_Check(obj)) {
++bool_count_;
*keep_going = make_unions_;
- } else if (internal::PyFloatScalar_Check(obj)) {
+ } else if (PyFloat_Check(obj)) {
++float_count_;
*keep_going = make_unions_;
} else if (internal::IsPyInteger(obj)) {
@@ -367,7 +376,7 @@ class TypeInferrer {
});
}
- Status GetType(std::shared_ptr<DataType>* out) const {
+ Status GetType(std::shared_ptr<DataType>* out) {
// TODO(wesm): handling forming unions
if (make_unions_) {
return Status::NotImplemented("Creating union types not yet supported");
@@ -375,11 +384,48 @@ class TypeInferrer {
RETURN_NOT_OK(Validate());
- if (numpy_unifier_.current_dtype() != nullptr) {
- std::shared_ptr<DataType> type;
- RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type));
- *out = type;
- } else if (list_count_) {
+ if (numpy_dtype_count_ > 0) {
+ // All NumPy scalars and Nones/nulls
+ if (numpy_dtype_count_ + none_count_ == total_count_) {
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type));
+ *out = type;
+ return Status::OK();
+ }
+
+ // The "bad path": data contains a mix of NumPy scalars and
+ // other kinds of scalars. Note this can happen innocuously
+ // because numpy.nan is not a NumPy scalar (it's a built-in
+ // PyFloat)
+
+ // TODO(ARROW-5564): Merge together type unification so this
+ // hack is not necessary
+ switch (numpy_unifier_.current_type_num()) {
+ case NPY_BOOL:
+ bool_count_ += numpy_dtype_count_;
+ break;
+ case NPY_INT8:
+ case NPY_INT16:
+ case NPY_INT32:
+ case NPY_INT64:
+ case NPY_UINT8:
+ case NPY_UINT16:
+ case NPY_UINT32:
+ case NPY_UINT64:
+ int_count_ += numpy_dtype_count_;
+ break;
+ case NPY_FLOAT32:
+ case NPY_FLOAT64:
+ float_count_ += numpy_dtype_count_;
+ break;
+ case NPY_DATETIME:
+ return Status::Invalid(
+ "numpy.datetime64 scalars cannot be mixed "
+ "with other Python scalar values currently");
+ }
+ }
+
+ if (list_count_) {
std::shared_ptr<DataType> value_type;
RETURN_NOT_OK(list_inferrer_->GetType(&value_type));
*out = list(value_type);
@@ -439,13 +485,15 @@ class TypeInferrer {
Status VisitDType(PyArray_Descr* dtype, bool* keep_going) {
// Continue visiting dtypes for now.
// TODO(wesm): devise approach for unions
+ ++numpy_dtype_count_;
*keep_going = true;
return numpy_unifier_.Observe(dtype);
}
Status VisitList(PyObject* obj, bool* keep_going /* unused */) {
if (!list_inferrer_) {
- list_inferrer_.reset(new TypeInferrer(validate_interval_, make_unions_));
+ list_inferrer_.reset(
+ new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
}
++list_count_;
return list_inferrer_->VisitSequence(obj);
@@ -458,9 +506,15 @@ class TypeInferrer {
}
// Not an object array: infer child Arrow type from dtype
if (!list_inferrer_) {
- list_inferrer_.reset(new TypeInferrer(validate_interval_, make_unions_));
+ list_inferrer_.reset(
+ new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
}
++list_count_;
+
+ // XXX(wesm): In ARROW-4324 I added accounting to check whether
+ // all of the non-null values have NumPy dtypes, but the
+ // total_count not not being properly incremented here
+ ++(*list_inferrer_).total_count_;
return list_inferrer_->VisitDType(dtype, keep_going);
}
@@ -484,7 +538,8 @@ class TypeInferrer {
if (it == struct_inferrers_.end()) {
it = struct_inferrers_
.insert(
- std::make_pair(key, TypeInferrer(validate_interval_, make_unions_)))
+ std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
+ validate_interval_, make_unions_)))
.first;
}
TypeInferrer* visitor = &it->second;
@@ -503,9 +558,9 @@ class TypeInferrer {
return Status::OK();
}
- Status GetStructType(std::shared_ptr<DataType>* out) const {
+ Status GetStructType(std::shared_ptr<DataType>* out) {
std::vector<std::shared_ptr<Field>> fields;
- for (const auto& it : struct_inferrers_) {
+ for (auto&& it : struct_inferrers_) {
std::shared_ptr<DataType> field_type;
RETURN_NOT_OK(it.second.GetType(&field_type));
fields.emplace_back(field(it.first, field_type));
@@ -515,6 +570,7 @@ class TypeInferrer {
}
private:
+ bool pandas_null_sentinels_;
int64_t validate_interval_;
bool make_unions_;
int64_t total_count_;
@@ -532,8 +588,9 @@ class TypeInferrer {
int64_t unicode_count_;
int64_t decimal_count_;
int64_t list_count_;
- std::unique_ptr<TypeInferrer> list_inferrer_;
int64_t struct_count_;
+ int64_t numpy_dtype_count_;
+ std::unique_ptr<TypeInferrer> list_inferrer_;
std::map<std::string, TypeInferrer> struct_inferrers_;
// If we observe a strongly-typed value in e.g. a NumPy array, we can store
@@ -548,9 +605,10 @@ class TypeInferrer {
};
// Non-exhaustive type inference
-Status InferArrowType(PyObject* obj, std::shared_ptr<DataType>* out_type) {
+Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+ std::shared_ptr<DataType>* out_type) {
PyDateTime_IMPORT;
- TypeInferrer inferrer;
+ TypeInferrer inferrer(pandas_null_sentinels);
RETURN_NOT_OK(inferrer.VisitSequence(obj));
RETURN_NOT_OK(inferrer.GetType(out_type));
if (*out_type == nullptr) {
@@ -560,7 +618,7 @@ Status InferArrowType(PyObject* obj, std::shared_ptr<DataType>* out_type) {
return Status::OK();
}
-Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
+Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels, int64_t* size,
std::shared_ptr<DataType>* out_type) {
if (!PySequence_Check(obj)) {
return Status::TypeError("Object is not a sequence");
@@ -572,7 +630,7 @@ Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
*out_type = null();
return Status::OK();
}
- RETURN_NOT_OK(InferArrowType(obj, out_type));
+ RETURN_NOT_OK(InferArrowType(obj, pandas_null_sentinels, out_type));
return Status::OK();
}
diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h
index 8790250..746e922 100644
--- a/cpp/src/arrow/python/inference.h
+++ b/cpp/src/arrow/python/inference.h
@@ -40,10 +40,12 @@ namespace py {
// These three functions take a sequence input, not arbitrary iterables
ARROW_PYTHON_EXPORT
-arrow::Status InferArrowType(PyObject* obj, std::shared_ptr<arrow::DataType>* out_type);
+arrow::Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+ std::shared_ptr<arrow::DataType>* out_type);
ARROW_PYTHON_EXPORT
-arrow::Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
+arrow::Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels,
+ int64_t* size,
std::shared_ptr<arrow::DataType>* out_type);
/// Checks whether the passed Python object is a boolean scalar
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
index 19bcde0..da826f5 100644
--- a/cpp/src/arrow/python/numpy-internal.h
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -171,6 +171,15 @@ inline bool PyBoolScalar_Check(PyObject* obj) {
return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
}
+static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
+ if (type == NPY_DATETIME) {
+ // It is not safe to mutate the result of DescrFromType
+ return PyArray_DescrNewFromType(type);
+ } else {
+ return PyArray_DescrFromType(type);
+ }
+}
+
} // namespace internal
} // namespace py
diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc
index ad1ae01..03e4970 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -442,19 +442,5 @@ TEST_F(DecimalTest, UpdateWithNaN) {
ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.scale());
}
-TEST(PythonTest, ConstructStringArrayWithLeadingZeros) {
- PyAcquireGIL lock;
-
- OwnedRef list_ref(PyList_New(2));
- PyObject* list = list_ref.obj();
- std::string str("str");
-
- ASSERT_EQ(0, PyList_SetItem(list, 0, PyFloat_FromDouble(NAN)));
- ASSERT_EQ(0, PyList_SetItem(list, 1, PyUnicode_FromString(str.c_str())));
-
- std::shared_ptr<ChunkedArray> out;
- ASSERT_OK(ConvertPySequence(list, {}, &out));
-}
-
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 2d07f0e..2d1d5d2 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -972,7 +972,7 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask,
bool strict_conversions = false;
if (options.type == nullptr) {
- RETURN_NOT_OK(InferArrowType(seq, &real_type));
+ RETURN_NOT_OK(InferArrowType(seq, options.from_pandas, &real_type));
} else {
real_type = options.type;
strict_conversions = true;
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index e504806..ae6104f 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -84,7 +84,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
return pyarrow_wrap_array(chunked_out.get().chunk(0))
-def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
+def array(object obj, type=None, mask=None, size=None, from_pandas=None,
bint safe=True, MemoryPool memory_pool=None):
"""
Create pyarrow.Array instance from a Python object
@@ -105,11 +105,13 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
will be treated as a "max size", but will involve an initial allocation
of size followed by a resize to the actual size (so if you know the
exact size specifying it correctly will give you better performance).
- from_pandas : boolean, default False
- Use pandas's semantics for inferring nulls from values in ndarray-like
- data. If passed, the mask tasks precendence, but if a value is unmasked
- (not-null), but still null according to pandas semantics, then it is
- null
+ from_pandas : boolean, default None
+ Use pandas's semantics for inferring nulls from values in
+ ndarray-like data. If passed, the mask tasks precendence, but
+ if a value is unmasked (not-null), but still null according to
+ pandas semantics, then it is null. Defaults to False if not
+ passed explicitly by user, or True if a pandas object is
+ passed in
safe : boolean, default True
Check for overflows or other unsafe conversions
memory_pool : pyarrow.MemoryPool, optional
@@ -147,14 +149,26 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
array : pyarrow.Array or pyarrow.ChunkedArray (if object data
overflowed binary storage)
"""
+ cdef:
+ CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+ bint is_pandas_object = False
+ bint c_from_pandas
+
type = ensure_type(type, allow_none=True)
- cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+
+ if from_pandas is None:
+ c_from_pandas = False
+ else:
+ c_from_pandas = from_pandas
if _is_array_like(obj):
if mask is not None:
- mask = get_series_values(mask)
+ # out argument unused
+ mask = get_series_values(mask, &is_pandas_object)
- values = get_series_values(obj)
+ values = get_series_values(obj, &is_pandas_object)
+ if is_pandas_object and from_pandas is None:
+ c_from_pandas = True
if pandas_api.is_categorical(values):
return DictionaryArray.from_arrays(
@@ -166,11 +180,11 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
if pandas_api.have_pandas:
values, type = pandas_api.compat.get_datetimetz_type(
values, obj.dtype, type)
- return _ndarray_to_array(values, mask, type, from_pandas, safe,
+ return _ndarray_to_array(values, mask, type, c_from_pandas, safe,
pool)
else:
# ConvertPySequence does strict conversion if type is explicitly passed
- return _sequence_to_array(obj, mask, size, type, pool, from_pandas)
+ return _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
def asarray(values, type=None):
@@ -1463,13 +1477,16 @@ cdef dict _array_classes = {
}
-cdef object get_series_values(object obj):
+cdef object get_series_values(object obj, bint* is_series):
if pandas_api.is_series(obj):
result = obj.values
+ is_series[0] = True
elif isinstance(obj, np.ndarray):
result = obj
+ is_series[0] = False
else:
result = pandas_api.make_series(obj).values
+ is_series[0] = False
return result
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 17ae1e3..5eebfbd 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -290,12 +290,8 @@ def test_sequence_numpy_integer(seq, np_scalar_pa_type):
def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
np_scalar, pa_type = np_scalar_pa_type
expected = [np_scalar(1), None, np_scalar(3), None]
- if np_scalar != np.uint64:
- expected += [np_scalar(np.iinfo(np_scalar).min),
- np_scalar(np.iinfo(np_scalar).max)]
- else:
- # max(uint64) is too large for the inferred int64 type
- expected += [0, np.iinfo(np.int64).max]
+ expected += [np_scalar(np.iinfo(np_scalar).min),
+ np_scalar(np.iinfo(np_scalar).max)]
arr = pa.array(seq(expected))
assert len(arr) == 6
assert arr.null_count == 2
@@ -303,6 +299,14 @@ def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
assert arr.to_pylist() == expected
+def test_numpy_scalars_mixed_type():
+ # ARROW-4324
+ data = [np.int32(10), np.float32(0.5)]
+ arr = pa.array(data)
+ expected = pa.array([10, 0.5], type='float64')
+ assert arr.equals(expected)
+
+
@pytest.mark.xfail(reason="Type inference for uint64 not implemented",
raises=pa.ArrowException)
def test_uint64_max_convert():
@@ -435,9 +439,13 @@ def test_mixed_sequence_errors():
@parametrize_with_iterable_types
-@pytest.mark.parametrize("np_scalar", [np.float16, np.float32, np.float64])
+@pytest.mark.parametrize("np_scalar,pa_type", [
+ (np.float16, pa.float16()),
+ (np.float32, pa.float32()),
+ (np.float64, pa.float64())
+])
@pytest.mark.parametrize("from_pandas", [True, False])
-def test_sequence_numpy_double(seq, np_scalar, from_pandas):
+def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
arr = pa.array(seq(data), from_pandas=from_pandas)
assert len(arr) == 6
@@ -445,7 +453,12 @@ def test_sequence_numpy_double(seq, np_scalar, from_pandas):
assert arr.null_count == 3
else:
assert arr.null_count == 2
- assert arr.type == pa.float64()
+ if from_pandas:
+ # The NaN is skipped in type inference, otherwise it forces a
+ # float64 promotion
+ assert arr.type == pa_type
+ else:
+ assert arr.type == pa.float64()
assert arr.to_pylist()[:4] == data[:4]
if from_pandas:
@@ -475,6 +488,17 @@ def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
[[1., 2.], [1., 2., 3.], [np.nan], None])
+def test_array_ignore_nan_from_pandas():
+ # See ARROW-4324, this reverts logic that was introduced in
+ # ARROW-2240
+ with pytest.raises(ValueError):
+ pa.array([np.nan, 'str'])
+
+ arr = pa.array([np.nan, 'str'], from_pandas=True)
+ expected = pa.array([None, 'str'])
+ assert arr.equals(expected)
+
+
def test_nested_ndarray_different_dtypes():
data = [
np.array([1, 2, 3], dtype='int64'),
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 3b5bd57..46e4f65 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -593,6 +593,13 @@ class TestConvertPrimitiveTypes(object):
expected = pd.Series([False, True, True, None, True])
_check_array_roundtrip(s, expected=expected, type=pa.bool_())
+ def test_series_from_pandas_false_respected(self):
+ # Check that explicit from_pandas=False is respected
+ s = pd.Series([0.0, np.nan])
+ arr = pa.array(s, from_pandas=False)
+ assert arr.null_count == 0
+ assert np.isnan(arr[1].as_py())
+
def test_integer_no_nulls(self):
data = OrderedDict()
fields = []