You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/25 00:18:38 UTC
[arrow] branch master updated: ARROW-5208: [Python] Add mask argument to pyarrow.infer_type, do not look at masked values when inferring output type in pyarrow.array
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7e4039b ARROW-5208: [Python] Add mask argument to pyarrow.infer_type, do not look at masked values when inferring output type in pyarrow.array
7e4039b is described below
commit 7e4039b32ef20e48944d12c64ffd78d5efb4ebd0
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Jun 24 19:18:28 2019 -0500
ARROW-5208: [Python] Add mask argument to pyarrow.infer_type, do not look at masked values when inferring output type in pyarrow.array
Author: Wes McKinney <we...@apache.org>
Closes #4677 from wesm/ARROW-5208 and squashes the following commits:
833f075d9 <Wes McKinney> add another test case with ndarray dtype=object argument
d7860a206 <Wes McKinney> Add mask arguments to infer_type, respect mask in ConvertPySequence when inferring type
---
cpp/src/arrow/python/inference.cc | 29 ++++++++++++++++++++---------
cpp/src/arrow/python/inference.h | 15 +++++++++++----
cpp/src/arrow/python/python_to_arrow.cc | 2 +-
python/pyarrow/array.pxi | 10 ++++++++--
python/pyarrow/includes/libarrow.pxd | 3 ++-
python/pyarrow/tests/test_array.py | 27 +++++++++++++++++++++++++++
6 files changed, 69 insertions(+), 17 deletions(-)
diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc
index 4ec4d9d..9d2c35f 100644
--- a/cpp/src/arrow/python/inference.cc
+++ b/cpp/src/arrow/python/inference.cc
@@ -370,10 +370,21 @@ class TypeInferrer {
}
// Infer value type from a sequence of values
- Status VisitSequence(PyObject* obj) {
- return internal::VisitSequence(obj, [this](PyObject* value, bool* keep_going) {
- return Visit(value, keep_going);
- });
+ Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) {
+ if (mask == nullptr || mask == Py_None) {
+ return internal::VisitSequence(obj, [this](PyObject* value, bool* keep_going) {
+ return Visit(value, keep_going);
+ });
+ } else {
+ return internal::VisitSequenceMasked(
+ obj, mask, [this](PyObject* value, uint8_t masked, bool* keep_going) {
+ if (!masked) {
+ return Visit(value, keep_going);
+ } else {
+ return Status::OK();
+ }
+ });
+ }
}
Status GetType(std::shared_ptr<DataType>* out) {
@@ -605,11 +616,11 @@ class TypeInferrer {
};
// Non-exhaustive type inference
-Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels,
std::shared_ptr<DataType>* out_type) {
PyDateTime_IMPORT;
TypeInferrer inferrer(pandas_null_sentinels);
- RETURN_NOT_OK(inferrer.VisitSequence(obj));
+ RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));
RETURN_NOT_OK(inferrer.GetType(out_type));
if (*out_type == nullptr) {
return Status::TypeError("Unable to determine data type");
@@ -618,8 +629,8 @@ Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
return Status::OK();
}
-Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels, int64_t* size,
- std::shared_ptr<DataType>* out_type) {
+Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask, bool pandas_null_sentinels,
+ int64_t* size, std::shared_ptr<DataType>* out_type) {
if (!PySequence_Check(obj)) {
return Status::TypeError("Object is not a sequence");
}
@@ -630,7 +641,7 @@ Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels, int64_t*
*out_type = null();
return Status::OK();
}
- RETURN_NOT_OK(InferArrowType(obj, pandas_null_sentinels, out_type));
+ RETURN_NOT_OK(InferArrowType(obj, mask, pandas_null_sentinels, out_type));
return Status::OK();
}
diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h
index 746e922..b1b7651 100644
--- a/cpp/src/arrow/python/inference.h
+++ b/cpp/src/arrow/python/inference.h
@@ -38,14 +38,21 @@ class Status;
namespace py {
-// These three functions take a sequence input, not arbitrary iterables
+// These functions take a sequence input, not arbitrary iterables
+
+/// \brief Infer Arrow type from a Python sequence
+/// \param[in] obj the sequence of values
+/// \param[in] mask an optional mask where True values are null. May
+/// be nullptr
+/// \param[in] pandas_null_sentinels use pandas's null value markers
+/// \param[out] out_type the inferred type
ARROW_PYTHON_EXPORT
-arrow::Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+arrow::Status InferArrowType(PyObject* obj, PyObject* mask, bool pandas_null_sentinels,
std::shared_ptr<arrow::DataType>* out_type);
ARROW_PYTHON_EXPORT
-arrow::Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels,
- int64_t* size,
+arrow::Status InferArrowTypeAndSize(PyObject* obj, PyObject* mask,
+ bool pandas_null_sentinels, int64_t* size,
std::shared_ptr<arrow::DataType>* out_type);
/// Checks whether the passed Python object is a boolean scalar
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index d8adcc5..28d8c13 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -978,7 +978,7 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask,
bool strict_conversions = false;
if (options.type == nullptr) {
- RETURN_NOT_OK(InferArrowType(seq, options.from_pandas, &real_type));
+ RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type));
} else {
real_type = options.type;
strict_conversions = true;
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index a985ff2..189e8a0 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -218,7 +218,7 @@ def asarray(values, type=None):
return array(values, type=type)
-def infer_type(values, from_pandas=False):
+def infer_type(values, mask=None, from_pandas=False):
"""
Attempt to infer Arrow data type that can hold the passed Python
sequence type in an Array object
@@ -226,6 +226,9 @@ def infer_type(values, from_pandas=False):
Parameters
----------
values : array-like
+ Sequence to infer type from
+ mask : ndarray (bool type), optional
+ Optional exclusion mask where True marks null, False non-null
from_pandas : boolean, default False
Use pandas's NA/null sentinel values for type inference
@@ -237,7 +240,10 @@ def infer_type(values, from_pandas=False):
shared_ptr[CDataType] out
c_bool use_pandas_sentinels = from_pandas
- check_status(InferArrowType(values, use_pandas_sentinels, &out))
+ if mask is not None and not isinstance(mask, np.ndarray):
+ mask = np.array(mask, dtype=bool)
+
+ check_status(InferArrowType(values, mask, use_pandas_sentinels, &out))
return pyarrow_wrap_data_type(out)
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 494b82d..305055c 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1153,7 +1153,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
cdef extern from "arrow/python/api.h" namespace "arrow::py":
# Requires GIL
- CStatus InferArrowType(object obj, c_bool pandas_null_sentinels,
+ CStatus InferArrowType(object obj, object mask,
+ c_bool pandas_null_sentinels,
shared_ptr[CDataType]* out_type)
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 531b835..80625ba 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1430,6 +1430,33 @@ def test_numpy_string_overflow_to_chunked():
value_index += 1
+def test_infer_type_masked():
+ # ARROW-5208
+ ty = pa.infer_type([u'foo', u'bar', None, 2],
+ mask=[False, False, False, True])
+ assert ty == pa.utf8()
+
+ # all masked
+ ty = pa.infer_type([u'foo', u'bar', None, 2],
+ mask=np.array([True, True, True, True]))
+ assert ty == pa.null()
+
+ # length 0
+ assert pa.infer_type([], mask=[]) == pa.null()
+
+
+def test_array_masked():
+ # ARROW-5208
+ arr = pa.array([4, None, 4, 3.],
+ mask=np.array([False, True, False, True]))
+ assert arr.type == pa.int64()
+
+ # ndarray dtype=object argument
+ arr = pa.array(np.array([4, None, 4, 3.], dtype="O"),
+ mask=np.array([False, True, False, True]))
+ assert arr.type == pa.int64()
+
+
def test_array_from_large_pyints():
# ARROW-5430
with pytest.raises(pa.ArrowInvalid):