You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/25 16:15:47 UTC
[arrow] branch master updated: ARROW-842: [Python] Recognize
pandas.NaT as null when converting object arrays with from_pandas=True
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 390fa7a ARROW-842: [Python] Recognize pandas.NaT as null when converting object arrays with from_pandas=True
390fa7a is described below
commit 390fa7aaab321b2bc72966ad7c6ac427bbc82505
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu Jun 25 11:15:14 2020 -0500
ARROW-842: [Python] Recognize pandas.NaT as null when converting object arrays with from_pandas=True
This has been the root cause of a number of bugs. I'm unclear if there's a race condition with tearing down a `static OwnedRef` so we might need some other approach to managing symbols imported from various Python modules
Closes #7537 from wesm/ARROW-842
Authored-by: Wes McKinney <we...@apache.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/arrow/python/helpers.cc | 41 ++++++++++++++++++++++++++++++++-
cpp/src/arrow/python/helpers.h | 4 ++++
cpp/src/arrow/python/python_to_arrow.cc | 6 +++++
python/pyarrow/tests/test_pandas.py | 11 +++++++++
4 files changed, 61 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 4d50eb9..852bf76 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -21,6 +21,7 @@
#include "arrow/python/helpers.h"
#include <limits>
+#include <mutex>
#include <sstream>
#include <type_traits>
#include <typeinfo>
@@ -254,6 +255,43 @@ bool PyFloat_IsNaN(PyObject* obj) {
return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
}
+namespace {
+
+static std::once_flag pandas_static_initialized;
+static PyTypeObject* pandas_NaTType = nullptr;
+static PyObject* pandas_NA = nullptr;
+
+void GetPandasStaticSymbols() {
+ OwnedRef pandas;
+ Status s = ImportModule("pandas", &pandas);
+ if (!s.ok()) {
+ return;
+ }
+
+ OwnedRef ref;
+ s = ImportFromModule(pandas.obj(), "NaT", &ref);
+ if (!s.ok()) {
+ return;
+ }
+ PyObject* nat_type = PyObject_Type(ref.obj());
+ pandas_NaTType = reinterpret_cast<PyTypeObject*>(nat_type);
+
+ // PyObject_Type returns a new reference but we trust that pandas.NaT will
+ // outlive our use of this PyObject*
+ Py_DECREF(nat_type);
+
+ if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
+ // If pandas.NA exists, retain a reference to it
+ pandas_NA = ref.obj();
+ }
+}
+
+} // namespace
+
+void InitPandasStaticData() {
+ std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
+}
+
bool PandasObjectIsNull(PyObject* obj) {
if (!MayHaveNaN(obj)) {
return false;
@@ -261,7 +299,8 @@ bool PandasObjectIsNull(PyObject* obj) {
if (obj == Py_None) {
return true;
}
- if (PyFloat_IsNaN(obj) ||
+ if (PyFloat_IsNaN(obj) || (pandas_NA && obj == pandas_NA) ||
+ (pandas_NaTType && PyObject_TypeCheck(obj, pandas_NaTType)) ||
(internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj))) {
return true;
}
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 92b4938..4d27c2c 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -68,6 +68,10 @@ Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref
// \brief Check whether obj is an integer, independent of Python versions.
inline bool IsPyInteger(PyObject* obj) { return PyLong_Check(obj); }
+// \brief Import symbols from pandas that we need for various type-checking,
+// like pandas.NaT or pandas.NA
+void InitPandasStaticData();
+
// \brief Use pandas missing value semantics to check if a value is null
ARROW_PYTHON_EXPORT
bool PandasObjectIsNull(PyObject* obj);
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 6e23b7f..bdaae4c 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -1171,6 +1171,12 @@ Status GetConverterFlat(const std::shared_ptr<DataType>& type, bool strict_conve
Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
bool strict_conversions, std::unique_ptr<SeqConverter>* out) {
+ if (from_pandas) {
+ // ARROW-842: If pandas is not installed then null checks will be less
+ // comprehensive, but that is okay.
+ internal::InitPandasStaticData();
+ }
+
switch (type->id()) {
case Type::LIST:
if (from_pandas) {
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 791ce14..392a322 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1205,6 +1205,17 @@ class TestConvertDateTimeLikeTypes:
tm.assert_frame_equal(table_pandas_objects,
expected_pandas_objects)
+ def test_object_null_values(self):
+ # ARROW-842
+ NA = getattr(pd, 'NA', None)
+ values = np.array([datetime(2000, 1, 1), pd.NaT, NA], dtype=object)
+ values_with_none = np.array([datetime(2000, 1, 1), None, None],
+ dtype=object)
+ result = pa.array(values, from_pandas=True)
+ expected = pa.array(values_with_none, from_pandas=True)
+ assert result.equals(expected)
+ assert result.null_count == 2
+
def test_dates_from_integers(self):
t1 = pa.date32()
t2 = pa.date64()