You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/25 00:18:38 UTC

[arrow] branch master updated: ARROW-5208: [Python] Add mask argument to pyarrow.infer_type, do not look at masked values when inferring output type in pyarrow.array

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7e4039b  ARROW-5208: [Python] Add mask argument to pyarrow.infer_type, do not look at masked values when inferring output type in pyarrow.array
7e4039b is described below

commit 7e4039b32ef20e48944d12c64ffd78d5efb4ebd0
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Jun 24 19:18:28 2019 -0500

    ARROW-5208: [Python] Add mask argument to pyarrow.infer_type, do not look at masked values when inferring output type in pyarrow.array
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #4677 from wesm/ARROW-5208 and squashes the following commits:
    
    833f075d9 <Wes McKinney> add another test case with ndarray dtype=object argument
    d7860a206 <Wes McKinney> Add mask arguments to infer_type, respect mask in ConvertPySequence when inferring type
---
 cpp/src/arrow/python/inference.cc       | 29 ++++++++++++++++++++---------
 cpp/src/arrow/python/inference.h        | 15 +++++++++++----
 cpp/src/arrow/python/python_to_arrow.cc |  2 +-
 python/pyarrow/array.pxi                | 10 ++++++++--
 python/pyarrow/includes/libarrow.pxd    |  3 ++-
 python/pyarrow/tests/test_array.py      | 27 +++++++++++++++++++++++++++
 6 files changed, 69 insertions(+), 17 deletions(-)

diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc
index 4ec4d9d..9d2c35f 100644
--- a/cpp/src/arrow/python/inference.cc
+++ b/cpp/src/arrow/python/inference.cc
@@ -370,10 +370,21 @@ class TypeInferrer {
   }
 
   // Infer value type from a sequence of values
-  Status VisitSequence(PyObject* obj) {
-    return internal::VisitSequence(obj, [this](PyObject* value, bool* keep_going) {
-      return Visit(value, keep_going);
-    });
+  Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) {
+    if (mask == nullptr || mask == Py_None) {
+      return internal::VisitSequence(obj, [this](PyObject* value, bool* keep_going) {
+        return Visit(value, keep_going);
+      });
+    } else {
+      return internal::VisitSequenceMasked(
+          obj, mask, [this](PyObject* value, uint8_t masked, bool* keep_going) {
+            if (!masked) {
+              return Visit(value, keep_going);
+            } else {
+              return Status::OK();
+            }
+          });
+    }
   }
 
   Status GetType(std::shared_ptr<DataType>* out) {
@@ -605,11 +616,11 @@ class TypeInferrer {
 };
 
 // Non-exhaustive type inference
-Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels,
                       std::shared_ptr<DataType>* out_type) {
   PyDateTime_IMPORT;
   TypeInferrer inferrer(pandas_null_sentinels);
-  RETURN_NOT_OK(inferrer.VisitSequence(obj));
+  RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));
   RETURN_NOT_OK(inferrer.GetType(out_type));
   if (*out_type == nullptr) {
     return Status::TypeError("Unable to determine data type");
@@ -618,8 +629,8 @@ Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
   return Status::OK();
 }
 
-Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels, int64_t* size,
-                             std::shared_ptr<DataType>* out_type) {
+Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask, bool pandas_null_sentinels,
+                             int64_t* size, std::shared_ptr<DataType>* out_type) {
   if (!PySequence_Check(obj)) {
     return Status::TypeError("Object is not a sequence");
   }
@@ -630,7 +641,7 @@ Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels, int64_t*
     *out_type = null();
     return Status::OK();
   }
-  RETURN_NOT_OK(InferArrowType(obj, pandas_null_sentinels, out_type));
+  RETURN_NOT_OK(InferArrowType(obj, mask, pandas_null_sentinels, out_type));
 
   return Status::OK();
 }
diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h
index 746e922..b1b7651 100644
--- a/cpp/src/arrow/python/inference.h
+++ b/cpp/src/arrow/python/inference.h
@@ -38,14 +38,21 @@ class Status;
 
 namespace py {
 
-// These three functions take a sequence input, not arbitrary iterables
+// These functions take a sequence input, not arbitrary iterables
+
+/// \brief Infer Arrow type from a Python sequence
+/// \param[in] obj the sequence of values
+/// \param[in] mask an optional mask where True values are null. May
+/// be nullptr or Py_None, in which case no values are masked
+/// \param[in] pandas_null_sentinels use pandas's null value markers
+/// \param[out] out_type the inferred type
 ARROW_PYTHON_EXPORT
-arrow::Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+arrow::Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels,
                              std::shared_ptr<arrow::DataType>* out_type);
 
 ARROW_PYTHON_EXPORT
-arrow::Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels,
-                                    int64_t* size,
+arrow::Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask,
+                                    bool pandas_null_sentinels, int64_t* size,
                                     std::shared_ptr<arrow::DataType>* out_type);
 
 /// Checks whether the passed Python object is a boolean scalar
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index d8adcc5..28d8c13 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -978,7 +978,7 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask,
   bool strict_conversions = false;
 
   if (options.type == nullptr) {
-    RETURN_NOT_OK(InferArrowType(seq, options.from_pandas, &real_type));
+    RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type));
   } else {
     real_type = options.type;
     strict_conversions = true;
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index a985ff2..189e8a0 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -218,7 +218,7 @@ def asarray(values, type=None):
         return array(values, type=type)
 
 
-def infer_type(values, from_pandas=False):
+def infer_type(values, mask=None, from_pandas=False):
     """
     Attempt to infer Arrow data type that can hold the passed Python
     sequence type in an Array object
@@ -226,6 +226,9 @@ def infer_type(values, from_pandas=False):
     Parameters
     ----------
     values : array-like
+        Sequence to infer type from
+    mask : array-like (bool type), optional
+        Optional exclusion mask where True marks null, False non-null
     from_pandas : boolean, default False
         Use pandas's NA/null sentinel values for type inference
 
@@ -237,7 +240,10 @@ def infer_type(values, from_pandas=False):
         shared_ptr[CDataType] out
         c_bool use_pandas_sentinels = from_pandas
 
-    check_status(InferArrowType(values, use_pandas_sentinels, &out))
+    if mask is not None and not isinstance(mask, np.ndarray):
+        mask = np.array(mask, dtype=bool)
+
+    check_status(InferArrowType(values, mask, use_pandas_sentinels, &out))
     return pyarrow_wrap_data_type(out)
 
 
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 494b82d..305055c 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1153,7 +1153,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
 
 cdef extern from "arrow/python/api.h" namespace "arrow::py":
     # Requires GIL
-    CStatus InferArrowType(object obj, c_bool pandas_null_sentinels,
+    CStatus InferArrowType(object obj, object mask,
+                           c_bool pandas_null_sentinels,
                            shared_ptr[CDataType]* out_type)
 
 
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 531b835..80625ba 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1430,6 +1430,33 @@ def test_numpy_string_overflow_to_chunked():
             value_index += 1
 
 
+def test_infer_type_masked():
+    # ARROW-5208
+    ty = pa.infer_type([u'foo', u'bar', None, 2],
+                       mask=[False, False, False, True])
+    assert ty == pa.utf8()
+
+    # all masked
+    ty = pa.infer_type([u'foo', u'bar', None, 2],
+                       mask=np.array([True, True, True, True]))
+    assert ty == pa.null()
+
+    # length 0
+    assert pa.infer_type([], mask=[]) == pa.null()
+
+
+def test_array_masked():
+    # ARROW-5208
+    arr = pa.array([4, None, 4, 3.],
+                   mask=np.array([False, True, False, True]))
+    assert arr.type == pa.int64()
+
+    # ndarray dtype=object argument
+    arr = pa.array(np.array([4, None, 4, 3.], dtype="O"),
+                   mask=np.array([False, True, False, True]))
+    assert arr.type == pa.int64()
+
+
 def test_array_from_large_pyints():
     # ARROW-5430
     with pytest.raises(pa.ArrowInvalid):