Posted to commits@arrow.apache.org by uw...@apache.org on 2018/09/04 06:36:39 UTC

[arrow] branch master updated: ARROW-1949: [Python/C++] Add option to Array.from_pandas and pyarrow.array to perform unsafe casts

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a712165  ARROW-1949: [Python/C++] Add option to Array.from_pandas and pyarrow.array to perform unsafe casts
a712165 is described below

commit a71216530d4ab7a9bf0dba8f2d540e7b446e5e20
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Tue Sep 4 08:36:29 2018 +0200

    ARROW-1949: [Python/C++] Add option to Array.from_pandas and pyarrow.array to perform unsafe casts
    
    Author: Krisztián Szűcs <sz...@gmail.com>
    
    Closes #2497 from kszucs/ARROW-1949 and squashes the following commits:
    
    f352c477 <Krisztián Szűcs> remove safe flag from _sequence_to_array
    70d6cae2 <Krisztián Szűcs> annotate boolean arguments as bint
    e838a14d <Krisztián Szűcs> check-format
    fff89aaa <Krisztián Szűcs> lint
    92ac3a92 <Krisztián Szűcs> tests for timestamp casts
    dd8871e8 <Krisztián Szűcs> wire CastOptions through the API
---
 cpp/src/arrow/python/numpy_to_arrow.cc | 35 ++++++++++++--------
 cpp/src/arrow/python/numpy_to_arrow.h  | 18 ++++++++++
 python/pyarrow/array.pxi               | 60 +++++++++++++++++++---------------
 python/pyarrow/includes/libarrow.pxd   |  6 ++++
 python/pyarrow/tests/test_array.py     | 25 ++++++++++++--
 5 files changed, 103 insertions(+), 41 deletions(-)
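
In user-facing terms, lossy conversions during array construction are now
rejected by default and enabled explicitly with safe=False. A minimal sketch,
mirroring the tests this patch adds to test_array.py:

    import pandas as pd
    import pyarrow as pa

    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10),
                        pd.Timestamp(1000)])

    # Nanosecond timestamps do not fit losslessly into microseconds,
    # so the default safe=True conversion raises ValueError.
    try:
        pa.array(series, type=pa.timestamp('us'))
    except ValueError:
        pass

    # safe=False permits the truncating cast: 1, 10 and 1000 ns
    # become 0, 0 and 1 us.
    arr = pa.array(series, type=pa.timestamp('us'), safe=False)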

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 502afc7..ece00c2 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -173,13 +173,15 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
 class NumPyConverter {
  public:
   NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
-                 const std::shared_ptr<DataType>& type, bool from_pandas)
+                 const std::shared_ptr<DataType>& type, bool from_pandas,
+                 const compute::CastOptions& cast_options = compute::CastOptions())
       : pool_(pool),
         type_(type),
         arr_(reinterpret_cast<PyArrayObject*>(arr)),
         dtype_(PyArray_DESCR(arr_)),
         mask_(nullptr),
         from_pandas_(from_pandas),
+        cast_options_(cast_options),
         null_bitmap_data_(nullptr),
         null_count_(0) {
     if (mo != nullptr && mo != Py_None) {
@@ -289,6 +291,7 @@ class NumPyConverter {
   int itemsize_;
 
   bool from_pandas_;
+  compute::CastOptions cast_options_;
 
   // Used in visitor pattern
   ArrayVector out_arrays_;
@@ -319,7 +322,8 @@ namespace {
 Status CastBuffer(const std::shared_ptr<DataType>& in_type,
                   const std::shared_ptr<Buffer>& input, const int64_t length,
                   const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
-                  const std::shared_ptr<DataType>& out_type, MemoryPool* pool,
+                  const std::shared_ptr<DataType>& out_type,
+                  const compute::CastOptions& cast_options, MemoryPool* pool,
                   std::shared_ptr<Buffer>* out) {
   // Must cast
   auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
@@ -328,9 +332,6 @@ Status CastBuffer(const std::shared_ptr<DataType>& in_type,
   std::shared_ptr<Array> casted_array;
 
   compute::FunctionContext context(pool);
-  compute::CastOptions cast_options;
-  cast_options.allow_int_overflow = false;
-  cast_options.allow_time_truncate = false;
 
   RETURN_NOT_OK(
       compute::Cast(&context, *tmp_array, out_type, cast_options, &casted_array));
@@ -412,7 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
   RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
 
   if (!input_type->Equals(*type_)) {
-    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_,
+                             pool_, data));
   }
 
   return Status::OK();
@@ -465,14 +467,14 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
-                                 type_, pool_, data));
+                                 type_, cast_options_, pool_, data));
       }
     }
   } else {
     RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
     if (!input_type->Equals(*type_)) {
-      RETURN_NOT_OK(
-          CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+                               cast_options_, pool_, data));
     }
   }
 
@@ -512,14 +514,14 @@ inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* d
       if (!input_type->Equals(*type_)) {
         // The null bitmap was already computed in VisitNative()
         RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
-                                 type_, pool_, data));
+                                 type_, cast_options_, pool_, data));
       }
     }
   } else {
     RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
     if (!input_type->Equals(*type_)) {
-      RETURN_NOT_OK(
-          CastBuffer(input_type, *data, length_, nullptr, 0, type_, pool_, data));
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_,
+                               cast_options_, pool_, data));
     }
   }
 
@@ -770,6 +772,7 @@ Status NumPyConverter::Visit(const StructType& type) {
 
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
                       std::shared_ptr<ChunkedArray>* out) {
   if (!PyArray_Check(ao)) {
     return Status::Invalid("Input object was not a NumPy array");
@@ -784,7 +787,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
     return ConvertPySequence(ao, mo, py_options, out);
   }
 
-  NumPyConverter converter(pool, ao, mo, type, from_pandas);
+  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
   RETURN_NOT_OK(converter.Convert());
   const auto& output_arrays = converter.result();
   DCHECK_GT(output_arrays.size(), 0);
@@ -792,5 +795,11 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
   return Status::OK();
 }
 
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out) {
+  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
+}
+
 }  // namespace py
 }  // namespace arrow
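
The cast_options threaded through above replace the flags that CastBuffer
previously hardcoded (allow_int_overflow = false, allow_time_truncate =
false); safe=False on the Python side flips both. A minimal sketch of the
integer case, assuming the usual two's-complement wrap-around on overflow:

    import numpy as np
    import pyarrow as pa

    values = np.array([1, 2, 300], dtype=np.int16)

    # 300 is out of range for int8, so the default safe conversion
    # raises (ArrowInvalid, a ValueError subclass).
    try:
        pa.array(values, type=pa.int8())
    except ValueError:
        pass

    # safe=False sets allow_int_overflow; 300 wraps to 44.
    arr = pa.array(values, type=pa.int8(), safe=False)
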
diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h
index bbdd576..5e1c088 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.h
+++ b/cpp/src/arrow/python/numpy_to_arrow.h
@@ -24,6 +24,7 @@
 
 #include <memory>
 
+#include "arrow/compute/kernels/cast.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -45,6 +46,23 @@ namespace py {
 /// \param[in] from_pandas If true, use pandas's null sentinels to determine
 /// whether values are null
 /// \param[in] type a specific type to cast to, may be null
+/// \param[in] cast_options casting options
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
+                      std::shared_ptr<ChunkedArray>* out);
+
+/// Safely convert NumPy arrays to Arrow. If the target data type is not
+/// known, pass a null type.
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
 /// \param[out] out a ChunkedArray, to accommodate chunked output
 ARROW_EXPORT
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 76a639a..f9a16a3 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -17,8 +17,7 @@
 
 
 cdef _sequence_to_array(object sequence, object mask, object size,
-                        DataType type,
-                        CMemoryPool* pool, c_bool from_pandas):
+                        DataType type, CMemoryPool* pool, c_bool from_pandas):
     cdef int64_t c_size
     cdef PyConversionOptions options
 
@@ -50,10 +49,14 @@ cdef _is_array_like(obj):
 
 
 cdef _ndarray_to_array(object values, object mask, DataType type,
-                       c_bool from_pandas,
-                       CMemoryPool* pool):
-    cdef shared_ptr[CChunkedArray] chunked_out
-    cdef shared_ptr[CDataType] c_type
+                       c_bool from_pandas, c_bool safe, CMemoryPool* pool):
+    cdef:
+        shared_ptr[CChunkedArray] chunked_out
+        shared_ptr[CDataType] c_type
+        CCastOptions cast_options
+
+    cast_options.allow_int_overflow = not safe
+    cast_options.allow_time_truncate = not safe
 
     dtype = values.dtype
 
@@ -66,7 +69,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
 
     with nogil:
         check_status(NdarrayToArrow(pool, values, mask, from_pandas,
-                                    c_type, &chunked_out))
+                                    c_type, cast_options, &chunked_out))
 
     if chunked_out.get().num_chunks() > 1:
         return pyarrow_wrap_chunked_array(chunked_out)
@@ -83,9 +86,8 @@ cdef inline DataType _ensure_type(object type):
         return type
 
 
-def array(object obj, type=None, mask=None,
-          MemoryPool memory_pool=None, size=None,
-          from_pandas=False):
+def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
+          bint safe=True, MemoryPool memory_pool=None):
     """
     Create pyarrow.Array instance from a Python object
 
@@ -94,14 +96,11 @@ def array(object obj, type=None, mask=None,
     obj : sequence, iterable, ndarray or Series
         If both type and size are specified, may be a single-use iterable. If
         not strongly-typed, the Arrow type will be inferred for the resulting array
-    mask : array (boolean), optional
-        Indicate which values are null (True) or not null (False).
     type : pyarrow.DataType
         Explicit type to attempt to coerce to, otherwise will be inferred from
         the data
-    memory_pool : pyarrow.MemoryPool, optional
-        If not passed, will allocate memory from the currently-set default
-        memory pool
+    mask : array (boolean), optional
+        Indicate which values are null (True) or not null (False).
     size : int64, optional
         Size of the elements. If the input is larger than size, bail at this
         length. For iterators, if size is larger than the input iterator this
@@ -113,6 +112,11 @@ def array(object obj, type=None, mask=None,
         data. If passed, the mask takes precedence, but if a value is unmasked
         (not-null) yet still null according to pandas semantics, then it is
         null
+    safe : boolean, default True
+        Check for overflows or other unsafe conversions
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the currently-set default
+        memory pool
 
     Notes
     -----
@@ -158,13 +162,15 @@ def array(object obj, type=None, mask=None,
             return DictionaryArray.from_arrays(
                 values.codes, values.categories.values,
                 mask=mask, ordered=values.ordered,
-                from_pandas=from_pandas,
+                from_pandas=from_pandas, safe=safe,
                 memory_pool=memory_pool)
         else:
             values, type = pdcompat.get_datetimetz_type(values, obj.dtype,
                                                         type)
-            return _ndarray_to_array(values, mask, type, from_pandas, pool)
+            return _ndarray_to_array(values, mask, type, from_pandas, safe,
+                                     pool)
     else:
+        # ConvertPySequence does strict conversion if type is explicitly passed
         return _sequence_to_array(obj, mask, size, type, pool, from_pandas)
 
 
@@ -352,7 +358,7 @@ cdef class Array:
         with nogil:
             check_status(DebugPrint(deref(self.ap), 0))
 
-    def cast(self, object target_type, safe=True):
+    def cast(self, object target_type, bint safe=True):
         """
         Cast array values to another data type.
 
@@ -439,7 +445,8 @@ cdef class Array:
         return wrap_datum(out)
 
     @staticmethod
-    def from_pandas(obj, mask=None, type=None, MemoryPool memory_pool=None):
+    def from_pandas(obj, mask=None, type=None, bint safe=True,
+                    MemoryPool memory_pool=None):
         """
         Convert pandas.Series to an Arrow Array, using pandas's semantics about
         what values indicate nulls. See pyarrow.array for more general
@@ -453,6 +460,8 @@ cdef class Array:
         type : pyarrow.DataType
             Explicit type to attempt to coerce to, otherwise will be inferred
             from the data
+        safe : boolean, default True
+            Check for overflows or other unsafe conversions
         memory_pool : pyarrow.MemoryPool, optional
             If not passed, will allocate memory from the currently-set default
             memory pool
@@ -468,8 +477,8 @@ cdef class Array:
         array : pyarrow.Array or pyarrow.ChunkedArray (if object data
         overflows binary buffer)
         """
-        return array(obj, mask=mask, type=type, memory_pool=memory_pool,
-                     from_pandas=True)
+        return array(obj, mask=mask, type=type, safe=safe, from_pandas=True,
+                     memory_pool=memory_pool)
 
     def __reduce__(self):
         return _restore_array, \
@@ -597,9 +606,8 @@ cdef class Array:
 
         return pyarrow_wrap_array(result)
 
-    def to_pandas(self, c_bool strings_to_categorical=False,
-                  c_bool zero_copy_only=False,
-                  c_bool integer_object_nulls=False):
+    def to_pandas(self, bint strings_to_categorical=False,
+                  bint zero_copy_only=False, bint integer_object_nulls=False):
         """
         Convert to a NumPy array object suitable for use in pandas.
 
@@ -1051,8 +1059,8 @@ cdef class DictionaryArray(Array):
         return self._indices
 
     @staticmethod
-    def from_arrays(indices, dictionary, mask=None, ordered=False,
-                    from_pandas=False, safe=True,
+    def from_arrays(indices, dictionary, mask=None, bint ordered=False,
+                    bint from_pandas=False, bint safe=True,
                     MemoryPool memory_pool=None):
         """
         Construct Arrow DictionaryArray from array of indices (must be
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 7fe354d..8bbbfcf 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -968,6 +968,12 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
                            const shared_ptr[CDataType]& type,
                            shared_ptr[CChunkedArray]* out)
 
+    CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
+                           c_bool from_pandas,
+                           const shared_ptr[CDataType]& type,
+                           const CCastOptions& cast_options,
+                           shared_ptr[CChunkedArray]* out)
+
     CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
                             shared_ptr[CTensor]* out)
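
With both overloads declared for Cython, pa.array and pa.Array.from_pandas
can forward the caller's choice; the ARROW-1949 tests below exercise both.
For reference, a sketch of the from_pandas path:

    import pandas as pd
    import pyarrow as pa

    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10),
                        pd.Timestamp(1000)])

    # from_pandas applies pandas null semantics and, as of this patch,
    # forwards safe= to pyarrow.array(); the default still rejects the
    # lossy ns -> us conversion.
    result = pa.Array.from_pandas(series, type=pa.timestamp('us'),
                                  safe=False)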
 
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 4bebe31..d4b582e 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -479,13 +479,18 @@ def test_string_from_buffers():
 
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
+    expected = pa.array(out_data, type=out_type)
 
+    # check casting an already created array
     in_arr = pa.array(in_data, type=in_type)
-
     casted = in_arr.cast(out_type, safe=safe)
-    expected = pa.array(out_data, type=out_type)
     assert casted.equals(expected)
 
+    # construct an array with the output type, which may involve casting;
+    # for more details see ARROW-1949
+    in_arr = pa.array(in_data, type=out_type, safe=safe)
+    assert in_arr.equals(expected)
+
 
 def test_cast_integers_safe():
     safe_cases = [
@@ -573,6 +578,22 @@ def test_cast_timestamp_unit():
     result = arr.cast(target, safe=False)
     assert result.equals(expected)
 
+    # ARROW-1949
+    series = pd.Series([pd.Timestamp(1), pd.Timestamp(10), pd.Timestamp(1000)])
+    expected = pa.array([0, 0, 1], type=pa.timestamp('us'))
+
+    with pytest.raises(ValueError):
+        pa.array(series, type=pa.timestamp('us'))
+
+    with pytest.raises(ValueError):
+        pa.Array.from_pandas(series, type=pa.timestamp('us'))
+
+    result = pa.Array.from_pandas(series, type=pa.timestamp('us'), safe=False)
+    assert result.equals(expected)
+
+    result = pa.array(series, type=pa.timestamp('us'), safe=False)
+    assert result.equals(expected)
+
 
 def test_cast_signed_to_unsigned():
     safe_cases = [