You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/09/04 06:36:39 UTC
[arrow] branch master updated: ARROW-1949: [Python/C++] Add option
to Array.from_pandas and pyarrow.array to perform unsafe casts
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a712165 ARROW-1949: [Python/C++] Add option to Array.from_pandas and pyarrow.array to perform unsafe casts
a712165 is described below
commit a71216530d4ab7a9bf0dba8f2d540e7b446e5e20
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Tue Sep 4 08:36:29 2018 +0200
ARROW-1949: [Python/C++] Add option to Array.from_pandas and pyarrow.array to perform unsafe casts
Author: Krisztián Szűcs <sz...@gmail.com>
Closes #2497 from kszucs/ARROW-1949 and squashes the following commits:
f352c477 <Krisztián Szűcs> remove safe flag from _sequence_to_array
70d6cae2 <Krisztián Szűcs> annotate boolean arguments as bint
e838a14d <Krisztián Szűcs> check-format
fff89aaa <Krisztián Szűcs> lint
92ac3a92 <Krisztián Szűcs> tests for timestamp casts
dd8871e8 <Krisztián Szűcs> wire CastOptions through the API
---
cpp/src/arrow/python/numpy_to_arrow.cc | 35 ++++++++++++--------
cpp/src/arrow/python/numpy_to_arrow.h | 18 ++++++++++
python/pyarrow/array.pxi | 60 +++++++++++++++++++---------------
python/pyarrow/includes/libarrow.pxd | 6 ++++
python/pyarrow/tests/test_array.py | 25 ++++++++++++--
5 files changed, 103 insertions(+), 41 deletions(-)
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 502afc7..ece00c2 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -173,13 +173,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
class NumPyConverter {
public:
NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
- const std::shared_ptr<DataType>& type, bool from_pandas)
+ const std::shared_ptr<DataType>& type, bool from_pandas,
+ const compute::CastOptions& cast_options = compute::CastOptions())
: pool_(pool),
type_(type),
arr_(reinterpret_cast<PyArrayObject*>(arr)),
dtype_(PyArray_DESCR(arr_)),
mask_(nullptr),
from_pandas_(from_pandas),
+ cast_options_(cast_options),
null_bitmap_data_(nullptr),
null_count_(0) {
if (mo != nullptr && mo != Py_None) {
@@ -289,6 +291,7 @@ class NumPyConverter {
int itemsize_;
bool from_pandas_;
+ compute::CastOptions cast_options_;
// Used in visitor pattern
ArrayVector out_arrays_;
@@ -319,7 +322,8 @@ namespace {
Status CastBuffer(const std::shared_ptr<DataType>& in_type,
const std::shared_ptr<Buffer>& input, const int64_t length,
const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
- const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
+ const std::shared_ptr<DataType>& out_type,
+ const compute::CastOptions& cast_options, MemoryPool* pool,
std::shared_ptr<Buffer>* out) {
// Must cast
auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
@@ -328,9 +332,6 @@ Status CastBuffer(const std::shared_ptr<DataType>& in_type,
std::shared_ptr<Array> casted_array;
compute::FunctionContext context(pool);
- compute::CastOptions cast_options;
- cast_options.allow_int_overflow = false;
- cast_options.allow_time_truncate = false;
RETURN_NOT_OK(
compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array));
@@ -412,7 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
if (!input_type->Equals(*type_)) {
- RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_,
+ pool_, data));
}
return Status::OK();
@@ -465,14 +467,14 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
if (!input_type->Equals(*type_)) {
// The null bitmap was already computed in VisitNative()
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
- type_, pool_, data));
+ type_, cast_options_, pool_, data));
}
}
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
if (!input_type->Equals(*type_)) {
- RETURN_NOT_OK(
- CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+ cast_options_, pool_, data));
}
}
@@ -512,14 +514,14 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
if (!input_type->Equals(*type_)) {
// The null bitmap was already computed in VisitNative()
RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
- type_, pool_, data));
+ type_, cast_options_, pool_, data));
}
}
} else {
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
if (!input_type->Equals(*type_)) {
- RETURN_NOT_OK(
- CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+ RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+ cast_options_, pool_, data));
}
}
@@ -770,6 +772,7 @@ Status NumPyConverter::Visit(const StructType& type) {
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
+ const compute::CastOptions& cast_options,
std::shared_ptr<ChunkedArray>* out) {
if (!PyArray_Check(ao)) {
return Status::Invalid("Input object was not a NumPy array");
@@ -784,7 +787,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
return ConvertPySequence(ao, mo, py_options, out);
}
- NumPyConverter converter(pool, ao, mo, type, from_pandas);
+ NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
RETURN_NOT_OK(converter.Convert());
const auto& output_arrays = converter.result();
DCHECK_GT(output_arrays.size(), 0);
@@ -792,5 +795,11 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
return Status::OK();
}
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+ const std::shared_ptr<DataType>& type,
+ std::shared_ptr<ChunkedArray>* out) {
+ return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
+}
+
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h
index bbdd576..5e1c088 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.h
+++ b/cpp/src/arrow/python/numpy_to_arrow.h
@@ -24,6 +24,7 @@
#include <memory>
+#include "arrow/compute/kernels/cast.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -45,6 +46,23 @@ namespace py {
/// \param[in] from_pandas If true, use pandas's null sentinels to determine
/// whether values are null
/// \param[in] type a specific type to cast to, may be null
+/// \param[in] cast_options casting options
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+ const std::shared_ptr<DataType>& type,
+ const compute::CastOptions& cast_options,
+ std::shared_ptr<ChunkedArray>* out);
+
+/// Safely convert NumPy arrays to Arrow. If target data type is not known,
+/// pass a type with null.
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
/// \param[out] out a ChunkedArray, to accommodate chunked output
ARROW_EXPORT
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 76a639a..f9a16a3 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -17,8 +17,7 @@
cdef _sequence_to_array(object sequence, object mask, object size,
- DataType type,
- CMemoryPool* pool, c_bool from_pandas):
+ DataType type, CMemoryPool* pool, c_bool from_pandas):
cdef int64_t c_size
cdef PyConversionOptions options
@@ -50,10 +49,14 @@ cdef _is_array_like(obj):
cdef _ndarray_to_array(object values, object mask, DataType type,
- c_bool from_pandas,
- CMemoryPool* pool):
- cdef shared_ptr[CChunkedArray] chunked_out
- cdef shared_ptr[CDataType] c_type
+ c_bool from_pandas, c_bool safe, CMemoryPool* pool):
+ cdef:
+ shared_ptr[CChunkedArray] chunked_out
+ shared_ptr[CDataType] c_type
+ CCastOptions cast_options
+
+ cast_options.allow_int_overflow = not safe
+ cast_options.allow_time_truncate = not safe
dtype = values.dtype
@@ -66,7 +69,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
with nogil:
check_status(NdarrayToArrow(pool, values, mask, from_pandas,
- c_type, &chunked_out))
+ c_type, cast_options, &chunked_out))
if chunked_out.get().num_chunks() > 1:
return pyarrow_wrap_chunked_array(chunked_out)
@@ -83,9 +86,8 @@ cdef inline DataType _ensure_type(object type):
return type
-def array(object obj, type=None, mask=None,
- MemoryPool memory_pool=None, size=None,
- from_pandas=False):
+def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
+ bint safe=True, MemoryPool memory_pool=None):
"""
Create pyarrow.Array instance from a Python object
@@ -94,14 +96,11 @@ def array(object obj, type=None, mask=None,
obj : sequence, iterable, ndarray or Series
If both type and size are specified may be a single use iterable. If
not strongly-typed, Arrow type will be inferred for resulting array
- mask : array (boolean), optional
- Indicate which values are null (True) or not null (False).
type : pyarrow.DataType
Explicit type to attempt to coerce to, otherwise will be inferred from
the data
- memory_pool : pyarrow.MemoryPool, optional
- If not passed, will allocate memory from the currently-set default
- memory pool
+ mask : array (boolean), optional
+ Indicate which values are null (True) or not null (False).
size : int64, optional
Size of the elements. If the imput is larger than size bail at this
length. For iterators, if size is larger than the input iterator this
@@ -113,6 +112,11 @@ def array(object obj, type=None, mask=None,
data. If passed, the mask tasks precendence, but if a value is unmasked
(not-null), but still null according to pandas semantics, then it is
null
+ safe : boolean, default True
+ Check for overflows or other unsafe conversions
+ memory_pool : pyarrow.MemoryPool, optional
+ If not passed, will allocate memory from the currently-set default
+ memory pool
Notes
-----
@@ -158,13 +162,15 @@ def array(object obj, type=None, mask=None,
return DictionaryArray.from_arrays(
values.codes, values.categories.values,
mask=mask, ordered=values.ordered,
- from_pandas=from_pandas,
+ from_pandas=from_pandas, safe=safe,
memory_pool=memory_pool)
else:
values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
type)
- return _ndarray_to_array(values, mask, type, from_pandas, pool)
+ return _ndarray_to_array(values, mask, type, from_pandas, safe,
+ pool)
else:
+ # ConvertPySequence does strict conversion if type is explicitly passed
return _sequence_to_array(obj, mask, size, type, pool, from_pandas)
@@ -352,7 +358,7 @@ cdef class Array:
with nogil:
check_status(DebugPrint(deref(self.ap), 0))
- def cast(self, object target_type, safe=True):
+ def cast(self, object target_type, bint safe=True):
"""
Cast array values to another data type.
@@ -439,7 +445,8 @@ cdef class Array:
return wrap_datum(out)
@staticmethod
- def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None):
+ def from_pandas(obj, mask=None, type=None, bint safe=True,
+ MemoryPool memory_pool=None):
"""
Convert pandas.Series to an Arrow Array, using pandas's semantics about
what values indicate nulls. See pyarrow.array for more general
@@ -453,6 +460,8 @@ cdef class Array:
type : pyarrow.DataType
Explicit type to attempt to coerce to, otherwise will be inferred
from the data
+ safe : boolean, default True
+ Check for overflows or other unsafe conversions
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool
@@ -468,8 +477,8 @@ cdef class Array:
array : pyarrow.Array or pyarrow.ChunkedArray (if object data
overflows binary buffer)
"""
- return array(obj, mask=mask, type=type, memory_pool=memory_pool,
- from_pandas=True)
+ return array(obj, mask=mask, type=type, safe=safe, from_pandas=True,
+ memory_pool=memory_pool)
def __reduce__(self):
return _restore_array, \
@@ -597,9 +606,8 @@ cdef class Array:
return pyarrow_wrap_array(result)
- def to_pandas(self, c_bool strings_to_categorical=False,
- c_bool zero_copy_only=False,
- c_bool integer_object_nulls=False):
+ def to_pandas(self, bint strings_to_categorical=False,
+ bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert to a NumPy array object suitable for use in pandas.
@@ -1051,8 +1059,8 @@ cdef class DictionaryArray(Array):
return self._indices
@staticmethod
- def from_arrays(indices, dictionary, mask=None, ordered=False,
- from_pandas=False, safe=True,
+ def from_arrays(indices, dictionary, mask=None, bint ordered=False,
+ bint from_pandas=False, bint safe=True,
MemoryPool memory_pool=None):
"""
Construct Arrow DictionaryArray from array of indices (must be
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 7fe354d..8bbbfcf 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -968,6 +968,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
const shared_ptr[CDataType]& type,
shared_ptr[CChunkedArray]* out)
+ CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
+ c_bool from_pandas,
+ const shared_ptr[CDataType]& type,
+ const CCastOptions& cast_options,
+ shared_ptr[CChunkedArray]* out)
+
CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
shared_ptr[CTensor]* out)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 4bebe31..d4b582e 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -479,13 +479,18 @@ def test_string_from_buffers():
def _check_cast_case(case, safe=True):
in_data, in_type, out_data, out_type = case
+ expected = pa.array(out_data, type=out_type)
+ # check casting an already created array
in_arr = pa.array(in_data, type=in_type)
-
casted = in_arr.cast(out_type, safe=safe)
- expected = pa.array(out_data, type=out_type)
assert casted.equals(expected)
+ # constructing an array with out type which optionally involves casting
+ # for more see ARROW-1949
+ in_arr = pa.array(in_data, type=out_type, safe=safe)
+ assert in_arr.equals(expected)
+
def test_cast_integers_safe():
safe_cases = [
@@ -573,6 +578,22 @@ def test_cast_timestamp_unit():
result = arr.cast(target, safe=False)
assert result.equals(expected)
+ # ARROW-1949
+ series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])
+ expected = pa.array([0, 0, 1], type=pa.timestamp('us'))
+
+ with pytest.raises(ValueError):
+ pa.array(series, type=pa.timestamp('us'))
+
+ with pytest.raises(ValueError):
+ pa.Array.from_pandas(series, type=pa.timestamp('us'))
+
+ result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
+ assert result.equals(expected)
+
+ result = pa.array(series, type=pa.timestamp('us'), safe=False)
+ assert result.equals(expected)
+
def test_cast_signed_to_unsigned():
safe_cases = [