You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/07/17 12:54:31 UTC
[arrow] branch master updated: ARROW-2806: [C++/Python] More consistent null/nan handling
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3419058 ARROW-2806: [C++/Python] More consistent null/nan handling
3419058 is described below
commit 34190587a6d5c1fa49c2d6025b1f305b4f169d75
Author: Korn, Uwe <Uw...@blue-yonder.com>
AuthorDate: Tue Jul 17 14:54:21 2018 +0200
ARROW-2806: [C++/Python] More consistent null/nan handling
I'll take care of the cast issues mentioned in the ticket in a separate PR, already opened https://issues.apache.org/jira/browse/ARROW-2854 for them.
Author: Korn, Uwe <Uw...@blue-yonder.com>
Closes #2270 from xhochy/ARROW-2806 and squashes the following commits:
418f3fb0 <Korn, Uwe> ARROW-2806: More consistent null/nan handling
---
cpp/cmake_modules/FindClangTools.cmake | 6 +-
cpp/src/arrow/python/builtin_convert.cc | 179 +++++++++++++++++++++------
cpp/src/arrow/python/builtin_convert.h | 12 +-
cpp/src/arrow/python/numpy_to_arrow.cc | 9 +-
cpp/src/arrow/python/python-test.cc | 10 +-
python/doc/source/data.rst | 15 +++
python/pyarrow/array.pxi | 17 ++-
python/pyarrow/includes/libarrow.pxd | 7 +-
python/pyarrow/tests/test_convert_builtin.py | 74 ++++++++++-
9 files changed, 262 insertions(+), 67 deletions(-)
diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake
index 7d1d2d1..215a5cd 100644
--- a/cpp/cmake_modules/FindClangTools.cmake
+++ b/cpp/cmake_modules/FindClangTools.cmake
@@ -86,12 +86,10 @@ if (CLANG_FORMAT_VERSION)
if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")
# binary was still not found, look into Cellar
- # TODO: This currently only works for '.0' patch releases as
- # find_program does not support regular expressions
- # in the paths.
+ file(GLOB CLANG_FORMAT_PATH "${HOMEBREW_PREFIX}/Cellar/llvm/${CLANG_FORMAT_VERSION}.*")
find_program(CLANG_FORMAT_BIN
NAMES clang-format
- PATHS "${HOMEBREW_PREFIX}/Cellar/llvm/${CLANG_FORMAT_VERSION}.0/bin"
+ PATHS "${CLANG_FORMAT_PATH}/bin"
NO_DEFAULT_PATH
)
endif()
diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc
index 49f2b31..f193961 100644
--- a/cpp/src/arrow/python/builtin_convert.cc
+++ b/cpp/src/arrow/python/builtin_convert.cc
@@ -427,7 +427,7 @@ class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter
}
};
-template <typename IntType>
+template <typename IntType, bool from_pandas = true>
class TypedIntConverter
: public TypedConverterVisitor<NumericBuilder<IntType>, TypedIntConverter<IntType>> {
public:
@@ -439,6 +439,23 @@ class TypedIntConverter
}
};
+template <typename IntType>
+class TypedIntConverter<IntType, false>
+ : public TypedConverterVisitor<NumericBuilder<IntType>,
+ TypedIntConverter<IntType, false>> {
+ public:
+ Status AppendSingle(PyObject* obj) {
+ return (obj == Py_None) ? this->AppendNull() : this->AppendItem(obj);
+ }
+
+ // Append a non-missing item
+ Status AppendItem(PyObject* obj) {
+ typename IntType::c_type value;
+ RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+ return this->typed_builder_->Append(value);
+ }
+};
+
class Date32Converter : public TypedConverterVisitor<Date32Builder, Date32Converter> {
public:
// Append a non-missing item
@@ -523,18 +540,37 @@ class TimestampConverter
TimeUnit::type unit_;
};
+template <bool from_pandas = true>
class Float16Converter
- : public TypedConverterVisitor<HalfFloatBuilder, Float16Converter> {
+ : public TypedConverterVisitor<HalfFloatBuilder, Float16Converter<from_pandas>> {
public:
// Append a non-missing item
Status AppendItem(PyObject* obj) {
npy_half val;
RETURN_NOT_OK(PyFloat_AsHalf(obj, &val));
- return typed_builder_->Append(val);
+ return this->typed_builder_->Append(val);
+ }
+};
+
+template <>
+class Float16Converter<false>
+ : public TypedConverterVisitor<HalfFloatBuilder, Float16Converter<false>> {
+ public:
+ Status AppendSingle(PyObject* obj) override {
+ return (obj == Py_None) ? this->AppendNull() : this->AppendItem(obj);
+ }
+
+ // Append a non-missing item
+ Status AppendItem(PyObject* obj) {
+ npy_half val;
+ RETURN_NOT_OK(PyFloat_AsHalf(obj, &val));
+ return this->typed_builder_->Append(val);
}
};
-class Float32Converter : public TypedConverterVisitor<FloatBuilder, Float32Converter> {
+template <bool from_pandas = true>
+class Float32Converter
+ : public TypedConverterVisitor<FloatBuilder, Float32Converter<true>> {
public:
// Append a non-missing item
Status AppendItem(PyObject* obj) {
@@ -544,7 +580,25 @@ class Float32Converter : public TypedConverterVisitor<FloatBuilder, Float32Conve
}
};
-class DoubleConverter : public TypedConverterVisitor<DoubleBuilder, DoubleConverter> {
+template <>
+class Float32Converter<false>
+ : public TypedConverterVisitor<FloatBuilder, Float32Converter<false>> {
+ public:
+ Status AppendSingle(PyObject* obj) override {
+ return (obj == Py_None) ? this->AppendNull() : this->AppendItem(obj);
+ }
+
+ // Append a non-missing item
+ Status AppendItem(PyObject* obj) {
+ float val = static_cast<float>(PyFloat_AsDouble(obj));
+ RETURN_IF_PYERROR();
+ return this->typed_builder_->Append(val);
+ }
+};
+
+template <bool from_pandas = true>
+class DoubleConverter
+ : public TypedConverterVisitor<DoubleBuilder, DoubleConverter<true>> {
public:
// Append a non-missing item
Status AppendItem(PyObject* obj) {
@@ -554,6 +608,22 @@ class DoubleConverter : public TypedConverterVisitor<DoubleBuilder, DoubleConver
}
};
+template <>
+class DoubleConverter<false>
+ : public TypedConverterVisitor<DoubleBuilder, DoubleConverter<false>> {
+ public:
+ Status AppendSingle(PyObject* obj) override {
+ return (obj == Py_None) ? this->AppendNull() : this->AppendItem(obj);
+ }
+
+ // Append a non-missing item
+ Status AppendItem(PyObject* obj) {
+ double val = PyFloat_AsDouble(obj);
+ RETURN_IF_PYERROR();
+ return this->typed_builder_->Append(val);
+ }
+};
+
class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverter> {
public:
// Append a non-missing item
@@ -581,6 +651,8 @@ class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter>
class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> {
public:
+ explicit ListConverter(bool from_pandas) : from_pandas_(from_pandas) {}
+
Status Init(ArrayBuilder* builder) override;
// Append a non-missing item
@@ -595,10 +667,13 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> {
protected:
std::unique_ptr<SeqConverter> value_converter_;
+ bool from_pandas_;
};
class StructConverter : public TypedConverterVisitor<StructBuilder, StructConverter> {
public:
+ explicit StructConverter(bool from_pandas) : from_pandas_(from_pandas) {}
+
Status Init(ArrayBuilder* builder) override;
// Append a non-missing item
@@ -660,6 +735,7 @@ class StructConverter : public TypedConverterVisitor<StructBuilder, StructConver
int num_fields_;
// Whether we're converting from a sequence of dicts or tuples
enum { UNKNOWN, DICTS, TUPLES } source_kind_ = UNKNOWN;
+ bool from_pandas_;
};
class DecimalConverter
@@ -674,29 +750,39 @@ class DecimalConverter
}
};
+#define INT_CONVERTER(ArrowType) \
+ { \
+ if (from_pandas) { \
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<ArrowType, true>); \
+ } else { \
+ return std::unique_ptr<SeqConverter>(new TypedIntConverter<ArrowType, false>); \
+ } \
+ }
+
// Dynamic constructor for sequence converters
-std::unique_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type) {
+std::unique_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type,
+ bool from_pandas) {
switch (type->id()) {
case Type::NA:
return std::unique_ptr<SeqConverter>(new NullConverter);
case Type::BOOL:
return std::unique_ptr<SeqConverter>(new BoolConverter);
case Type::INT8:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int8Type>);
+ INT_CONVERTER(Int8Type)
case Type::INT16:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int16Type>);
+ INT_CONVERTER(Int16Type)
case Type::INT32:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int32Type>);
+ INT_CONVERTER(Int32Type)
case Type::INT64:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<Int64Type>);
+ INT_CONVERTER(Int64Type)
case Type::UINT8:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt8Type>);
+ INT_CONVERTER(UInt8Type)
case Type::UINT16:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt16Type>);
+ INT_CONVERTER(UInt16Type)
case Type::UINT32:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt32Type>);
+ INT_CONVERTER(UInt32Type)
case Type::UINT64:
- return std::unique_ptr<SeqConverter>(new TypedIntConverter<UInt64Type>);
+ INT_CONVERTER(UInt64Type)
case Type::DATE32:
return std::unique_ptr<SeqConverter>(new Date32Converter);
case Type::DATE64:
@@ -704,12 +790,27 @@ std::unique_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
case Type::TIMESTAMP:
return std::unique_ptr<SeqConverter>(
new TimestampConverter(checked_cast<const TimestampType&>(*type).unit()));
- case Type::HALF_FLOAT:
- return std::unique_ptr<SeqConverter>(new Float16Converter);
- case Type::FLOAT:
- return std::unique_ptr<SeqConverter>(new Float32Converter);
- case Type::DOUBLE:
- return std::unique_ptr<SeqConverter>(new DoubleConverter);
+ case Type::HALF_FLOAT: {
+ if (from_pandas) {
+ return std::unique_ptr<SeqConverter>(new Float16Converter<true>);
+ } else {
+ return std::unique_ptr<SeqConverter>(new Float16Converter<false>);
+ }
+ }
+ case Type::FLOAT: {
+ if (from_pandas) {
+ return std::unique_ptr<SeqConverter>(new Float32Converter<true>);
+ } else {
+ return std::unique_ptr<SeqConverter>(new Float32Converter<false>);
+ }
+ }
+ case Type::DOUBLE: {
+ if (from_pandas) {
+ return std::unique_ptr<SeqConverter>(new DoubleConverter<true>);
+ } else {
+ return std::unique_ptr<SeqConverter>(new DoubleConverter<false>);
+ }
+ }
case Type::BINARY:
return std::unique_ptr<SeqConverter>(new BytesConverter);
case Type::FIXED_SIZE_BINARY:
@@ -717,9 +818,9 @@ std::unique_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type
case Type::STRING:
return std::unique_ptr<SeqConverter>(new UTF8Converter);
case Type::LIST:
- return std::unique_ptr<SeqConverter>(new ListConverter);
+ return std::unique_ptr<SeqConverter>(new ListConverter(from_pandas));
case Type::STRUCT:
- return std::unique_ptr<SeqConverter>(new StructConverter);
+ return std::unique_ptr<SeqConverter>(new StructConverter(from_pandas));
case Type::DECIMAL:
return std::unique_ptr<SeqConverter>(new DecimalConverter);
default:
@@ -731,8 +832,8 @@ Status ListConverter::Init(ArrayBuilder* builder) {
builder_ = builder;
typed_builder_ = checked_cast<ListBuilder*>(builder);
- value_converter_ =
- GetConverter(checked_cast<const ListType&>(*builder->type()).value_type());
+ value_converter_ = GetConverter(
+ checked_cast<const ListType&>(*builder->type()).value_type(), from_pandas_);
if (value_converter_ == nullptr) {
return Status::NotImplemented("value type not implemented");
}
@@ -756,7 +857,7 @@ Status StructConverter::Init(ArrayBuilder* builder) {
const std::string& field_name(struct_type.child(i)->name());
std::shared_ptr<DataType> field_type(struct_type.child(i)->type());
- auto value_converter = GetConverter(field_type);
+ auto value_converter = GetConverter(field_type, from_pandas_);
if (value_converter == nullptr) {
return Status::NotImplemented("value type not implemented");
}
@@ -774,9 +875,10 @@ Status StructConverter::Init(ArrayBuilder* builder) {
}
Status AppendPySequence(PyObject* obj, int64_t size,
- const std::shared_ptr<DataType>& type, ArrayBuilder* builder) {
+ const std::shared_ptr<DataType>& type, ArrayBuilder* builder,
+ bool from_pandas) {
PyDateTime_IMPORT;
- auto converter = GetConverter(type);
+ auto converter = GetConverter(type, from_pandas);
if (converter == nullptr) {
std::stringstream ss;
ss << "No type converter implemented for " << type->ToString();
@@ -788,7 +890,8 @@ Status AppendPySequence(PyObject* obj, int64_t size,
static Status ConvertPySequenceReal(PyObject* obj, int64_t size,
const std::shared_ptr<DataType>* type,
- MemoryPool* pool, std::shared_ptr<Array>* out) {
+ MemoryPool* pool, bool from_pandas,
+ std::shared_ptr<Array>* out) {
PyAcquireGIL lock;
PyObject* seq;
@@ -814,28 +917,30 @@ static Status ConvertPySequenceReal(PyObject* obj, int64_t size,
// Give the sequence converter an array builder
std::unique_ptr<ArrayBuilder> builder;
RETURN_NOT_OK(MakeBuilder(pool, real_type, &builder));
- RETURN_NOT_OK(AppendPySequence(seq, size, real_type, builder.get()));
+ RETURN_NOT_OK(AppendPySequence(seq, size, real_type, builder.get(), from_pandas));
return builder->Finish(out);
}
-Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out) {
- return ConvertPySequenceReal(obj, -1, nullptr, pool, out);
+Status ConvertPySequence(PyObject* obj, MemoryPool* pool, bool from_pandas,
+ std::shared_ptr<Array>* out) {
+ return ConvertPySequenceReal(obj, -1, nullptr, pool, from_pandas, out);
}
Status ConvertPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
- MemoryPool* pool, std::shared_ptr<Array>* out) {
- return ConvertPySequenceReal(obj, -1, &type, pool, out);
+ MemoryPool* pool, bool from_pandas,
+ std::shared_ptr<Array>* out) {
+ return ConvertPySequenceReal(obj, -1, &type, pool, from_pandas, out);
}
-Status ConvertPySequence(PyObject* obj, int64_t size, MemoryPool* pool,
+Status ConvertPySequence(PyObject* obj, int64_t size, MemoryPool* pool, bool from_pandas,
std::shared_ptr<Array>* out) {
- return ConvertPySequenceReal(obj, size, nullptr, pool, out);
+ return ConvertPySequenceReal(obj, size, nullptr, pool, from_pandas, out);
}
Status ConvertPySequence(PyObject* obj, int64_t size,
const std::shared_ptr<DataType>& type, MemoryPool* pool,
- std::shared_ptr<Array>* out) {
- return ConvertPySequenceReal(obj, size, &type, pool, out);
+ bool from_pandas, std::shared_ptr<Array>* out) {
+ return ConvertPySequenceReal(obj, size, &type, pool, from_pandas, out);
}
} // namespace py
diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h
index 7a32bec..d9b5ecd 100644
--- a/cpp/src/arrow/python/builtin_convert.h
+++ b/cpp/src/arrow/python/builtin_convert.h
@@ -47,27 +47,29 @@ ARROW_EXPORT arrow::Status InferArrowTypeAndSize(
ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj, int64_t size,
const std::shared_ptr<arrow::DataType>& type,
- arrow::ArrayBuilder* builder);
+ arrow::ArrayBuilder* builder,
+ bool from_pandas);
// Type and size inference
ARROW_EXPORT
-Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out);
+Status ConvertPySequence(PyObject* obj, MemoryPool* pool, bool from_pandas,
+ std::shared_ptr<Array>* out);
// Type inference only
ARROW_EXPORT
-Status ConvertPySequence(PyObject* obj, int64_t size, MemoryPool* pool,
+Status ConvertPySequence(PyObject* obj, int64_t size, MemoryPool* pool, bool from_pandas,
std::shared_ptr<Array>* out);
// Size inference only
ARROW_EXPORT
Status ConvertPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
- MemoryPool* pool, std::shared_ptr<Array>* out);
+ MemoryPool* pool, bool from_pandas, std::shared_ptr<Array>* out);
// No inference
ARROW_EXPORT
Status ConvertPySequence(PyObject* obj, int64_t size,
const std::shared_ptr<DataType>& type, MemoryPool* pool,
- std::shared_ptr<Array>* out);
+ bool from_pandas, std::shared_ptr<Array>* out);
ARROW_EXPORT
Status InvalidConversion(PyObject* obj, const std::string& expected_type_name,
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index e18cced..09926ba 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -1273,7 +1273,8 @@ inline Status NumPyConverter::ConvertTypedLists(const std::shared_ptr<DataType>&
ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
return Status::TypeError(ss.str());
}
- return AppendPySequence(object, size, type, value_builder);
+ return AppendPySequence(object, size, type, value_builder,
+ use_pandas_null_sentinels_);
} else {
return Status::TypeError("Unsupported Python type for list items");
}
@@ -1368,7 +1369,8 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, BinaryType>(
ss << inferred_type->ToString() << " cannot be converted to BINARY.";
return Status::TypeError(ss.str());
}
- return AppendPySequence(object, size, type, value_builder);
+ return AppendPySequence(object, size, type, value_builder,
+ use_pandas_null_sentinels_);
} else {
return Status::TypeError("Unsupported Python type for list items");
}
@@ -1425,7 +1427,8 @@ inline Status NumPyConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
ss << inferred_type->ToString() << " cannot be converted to STRING.";
return Status::TypeError(ss.str());
}
- return AppendPySequence(object, size, type, value_builder);
+ return AppendPySequence(object, size, type, value_builder,
+ use_pandas_null_sentinels_);
} else {
return Status::TypeError("Unsupported Python type for list items");
}
diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc
index abe93b0..9763bef 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -269,7 +269,7 @@ TEST(BuiltinConversionTest, TestMixedTypeFails) {
ASSERT_EQ(PyList_SetItem(list, 1, integer), 0);
ASSERT_EQ(PyList_SetItem(list, 2, doub), 0);
- ASSERT_RAISES(TypeError, ConvertPySequence(list, pool, &arr));
+ ASSERT_RAISES(TypeError, ConvertPySequence(list, pool, false, &arr));
}
TEST_F(DecimalTest, FromPythonDecimalRescaleNotTruncateable) {
@@ -349,7 +349,7 @@ TEST_F(DecimalTest, TestNoneAndNaN) {
MemoryPool* pool = default_memory_pool();
std::shared_ptr<Array> arr;
- ASSERT_OK(ConvertPySequence(list, pool, &arr));
+ ASSERT_OK(ConvertPySequence(list, pool, false, &arr));
ASSERT_TRUE(arr->IsValid(0));
ASSERT_TRUE(arr->IsNull(1));
ASSERT_TRUE(arr->IsNull(2));
@@ -374,7 +374,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScale) {
MemoryPool* pool = default_memory_pool();
std::shared_ptr<Array> arr;
- ASSERT_OK(ConvertPySequence(list, pool, &arr));
+ ASSERT_OK(ConvertPySequence(list, pool, false, &arr));
const auto& type = checked_cast<const DecimalType&>(*arr->type());
int32_t expected_precision = 9;
@@ -402,7 +402,7 @@ TEST_F(DecimalTest, TestMixedPrecisionAndScaleSequenceConvert) {
ASSERT_EQ(PyList_SetItem(list, 0, value1), 0);
ASSERT_EQ(PyList_SetItem(list, 1, value2), 0);
- ASSERT_OK(ConvertPySequence(list, pool, &arr));
+ ASSERT_OK(ConvertPySequence(list, pool, false, &arr));
const auto& type = checked_cast<const Decimal128Type&>(*arr->type());
ASSERT_EQ(3, type.precision());
@@ -438,7 +438,7 @@ TEST(PythonTest, ConstructStringArrayWithLeadingZeros) {
std::shared_ptr<Array> out;
auto pool = default_memory_pool();
- ASSERT_OK(ConvertPySequence(list, pool, &out));
+ ASSERT_OK(ConvertPySequence(list, pool, false, &out));
}
} // namespace py
diff --git a/python/doc/source/data.rst b/python/doc/source/data.rst
index 0717260..3f4169c 100644
--- a/python/doc/source/data.rst
+++ b/python/doc/source/data.rst
@@ -198,6 +198,21 @@ Arrays can be sliced without copying:
arr[1:3]
+None values and NaN handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As mentioned in the above section, the Python object ``None`` is always
+converted to an Arrow null element on the conversion to ``pyarrow.Array``. For
+the float NaN value, which is represented by either the Python object
+``float('nan')`` or ``numpy.nan``, we normally convert it to a *valid* float
+value during the conversion. If an integer input is supplied to
+``pyarrow.array`` that contains ``np.nan``, ``ValueError`` is raised.
+
+For better compatibility with Pandas, we support interpreting NaN values as
+null elements. This is enabled automatically in all ``from_pandas`` functions
+and can be enabled on the other conversion functions by passing
+``from_pandas=True``
+as a function parameter.
+
List arrays
~~~~~~~~~~~
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 9d14e1e..de59509 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -17,25 +17,29 @@
cdef _sequence_to_array(object sequence, object size, DataType type,
- CMemoryPool* pool):
+ CMemoryPool* pool, c_bool from_pandas):
cdef shared_ptr[CArray] out
cdef int64_t c_size
if type is None:
if size is None:
with nogil:
- check_status(ConvertPySequence(sequence, pool, &out))
+ check_status(
+ ConvertPySequence(sequence, pool, from_pandas, &out)
+ )
else:
c_size = size
with nogil:
check_status(
- ConvertPySequence(sequence, c_size, pool, &out)
+ ConvertPySequence(
+ sequence, c_size, pool, from_pandas, &out
+ )
)
else:
if size is None:
with nogil:
check_status(
ConvertPySequence(
- sequence, type.sp_type, pool, &out,
+ sequence, type.sp_type, pool, from_pandas, &out,
)
)
else:
@@ -43,7 +47,8 @@ cdef _sequence_to_array(object sequence, object size, DataType type,
with nogil:
check_status(
ConvertPySequence(
- sequence, c_size, type.sp_type, pool, &out,
+ sequence, c_size, type.sp_type, pool, from_pandas,
+ &out,
)
)
@@ -178,7 +183,7 @@ def array(object obj, type=None, mask=None,
else:
if mask is not None:
raise ValueError("Masks only supported with ndarray-like inputs")
- return _sequence_to_array(obj, size, type, pool)
+ return _sequence_to_array(obj, size, type, pool, from_pandas)
def asarray(values, type=None):
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index d617986..dc8470f 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -938,14 +938,17 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
object PyHalf_FromHalf(npy_half value)
CStatus ConvertPySequence(object obj, CMemoryPool* pool,
+ c_bool from_pandas,
shared_ptr[CArray]* out)
CStatus ConvertPySequence(object obj, const shared_ptr[CDataType]& type,
- CMemoryPool* pool, shared_ptr[CArray]* out)
- CStatus ConvertPySequence(object obj, int64_t size, CMemoryPool* pool,
+ CMemoryPool* pool, c_bool from_pandas,
shared_ptr[CArray]* out)
+ CStatus ConvertPySequence(object obj, int64_t size, CMemoryPool* pool,
+ c_bool from_pandas, shared_ptr[CArray]* out)
CStatus ConvertPySequence(object obj, int64_t size,
const shared_ptr[CDataType]& type,
CMemoryPool* pool,
+ c_bool from_pandas,
shared_ptr[CArray]* out)
CStatus NumPyDtypeToArrow(object dtype, shared_ptr[CDataType]* type)
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 31228b4..7c7918e 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -226,6 +226,40 @@ def test_sequence_integer(seq, np_scalar_pa_type):
@parametrize_with_iterable_types
+@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
+def test_sequence_integer_np_nan(seq, np_scalar_pa_type):
+ # ARROW-2806: numpy.nan is a double value and thus should produce
+ # a double array.
+ _, pa_type = np_scalar_pa_type
+ with pytest.raises(ValueError):
+ pa.array(seq([np.nan]), type=pa_type, from_pandas=False)
+
+ arr = pa.array(seq([np.nan]), type=pa_type, from_pandas=True)
+ expected = [None]
+ assert len(arr) == 1
+ assert arr.null_count == 1
+ assert arr.type == pa_type
+ assert arr.to_pylist() == expected
+
+
+@parametrize_with_iterable_types
+@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs)
+def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type):
+ # ARROW-2806: numpy.nan is a double value and thus should produce
+ # a double array.
+ _, pa_type = np_scalar_pa_type
+ with pytest.raises(ValueError):
+ pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False)
+
+ arr = pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=True)
+ expected = [[None]]
+ assert len(arr) == 1
+ assert arr.null_count == 0
+ assert arr.type == pa.list_(pa_type)
+ assert arr.to_pylist() == expected
+
+
+@parametrize_with_iterable_types
def test_sequence_integer_inferred(seq):
expected = [1, None, 3, None]
arr = pa.array(seq(expected))
@@ -310,13 +344,43 @@ def test_sequence_double():
@parametrize_with_iterable_types
@pytest.mark.parametrize("np_scalar", [np.float16, np.float32, np.float64])
-def test_sequence_numpy_double(seq, np_scalar):
- data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, None]
- arr = pa.array(seq(data))
+@pytest.mark.parametrize("from_pandas", [True, False])
+def test_sequence_numpy_double(seq, np_scalar, from_pandas):
+ data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
+ arr = pa.array(seq(data), from_pandas=from_pandas)
assert len(arr) == 6
- assert arr.null_count == 3
+ if from_pandas:
+ assert arr.null_count == 3
+ else:
+ assert arr.null_count == 2
assert arr.type == pa.float64()
- assert arr.to_pylist() == data
+
+ assert arr.to_pylist()[:4] == data[:4]
+ if from_pandas:
+ assert arr.to_pylist()[5] is None
+ else:
+ assert np.isnan(arr.to_pylist()[5])
+
+
+@pytest.mark.parametrize("from_pandas", [True, False])
+@pytest.mark.parametrize("inner_seq", [np.array, list])
+def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
+ # ARROW-2806
+ data = np.array([
+ inner_seq([1., 2.]),
+ inner_seq([1., 2., 3.]),
+ inner_seq([np.nan]),
+ None
+ ])
+ arr = pa.array(data, from_pandas=from_pandas)
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ assert arr.type == pa.list_(pa.float64())
+ if from_pandas:
+ assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None]
+ else:
+ np.testing.assert_equal(arr.to_pylist(),
+ [[1., 2.], [1., 2., 3.], [np.nan], None])
def test_sequence_unicode():