You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/25 16:15:47 UTC

[arrow] branch master updated: ARROW-842: [Python] Recognize pandas.NaT as null when converting object arrays with from_pandas=True

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 390fa7a  ARROW-842: [Python] Recognize pandas.NaT as null when converting object arrays with from_pandas=True
390fa7a is described below

commit 390fa7aaab321b2bc72966ad7c6ac427bbc82505
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu Jun 25 11:15:14 2020 -0500

    ARROW-842: [Python] Recognize pandas.NaT as null when converting object arrays with from_pandas=True
    
    This has been the root cause of a number of bugs. I'm unclear if there's a race condition with tearing down a `static OwnedRef` so we might need some other approach to managing symbols imported from various Python modules
    
    Closes #7537 from wesm/ARROW-842
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/python/helpers.cc         | 41 ++++++++++++++++++++++++++++++++-
 cpp/src/arrow/python/helpers.h          |  4 ++++
 cpp/src/arrow/python/python_to_arrow.cc |  6 +++++
 python/pyarrow/tests/test_pandas.py     | 11 +++++++++
 4 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 4d50eb9..852bf76 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -21,6 +21,7 @@
 #include "arrow/python/helpers.h"
 
 #include <limits>
+#include <mutex>
 #include <sstream>
 #include <type_traits>
 #include <typeinfo>
@@ -254,6 +255,43 @@ bool PyFloat_IsNaN(PyObject* obj) {
   return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
 }
 
+namespace {
+
+static std::once_flag pandas_static_initialized;
+static PyTypeObject* pandas_NaTType = nullptr;
+static PyObject* pandas_NA = nullptr;
+
+void GetPandasStaticSymbols() {
+  OwnedRef pandas;
+  Status s = ImportModule("pandas", &pandas);
+  if (!s.ok()) {
+    return;
+  }
+
+  OwnedRef ref;
+  s = ImportFromModule(pandas.obj(), "NaT", &ref);
+  if (!s.ok()) {
+    return;
+  }
+  PyObject* nat_type = PyObject_Type(ref.obj());
+  pandas_NaTType = reinterpret_cast<PyTypeObject*>(nat_type);
+
+  // PyObject_Type returns a new reference but we trust that pandas.NaT will
+  // outlive our use of this PyObject*
+  Py_DECREF(nat_type);
+
+  if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
+    // If pandas.NA exists, retain a reference to it
+    pandas_NA = ref.obj();
+  }
+}
+
+}  // namespace
+
+void InitPandasStaticData() {
+  std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
+}
+
 bool PandasObjectIsNull(PyObject* obj) {
   if (!MayHaveNaN(obj)) {
     return false;
@@ -261,7 +299,8 @@ bool PandasObjectIsNull(PyObject* obj) {
   if (obj == Py_None) {
     return true;
   }
-  if (PyFloat_IsNaN(obj) ||
+  if (PyFloat_IsNaN(obj) || (pandas_NA && obj == pandas_NA) ||
+      (pandas_NaTType && PyObject_TypeCheck(obj, pandas_NaTType)) ||
       (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj))) {
     return true;
   }
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 92b4938..4d27c2c 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -68,6 +68,10 @@ Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref
 // \brief Check whether obj is an integer, independent of Python versions.
 inline bool IsPyInteger(PyObject* obj) { return PyLong_Check(obj); }
 
+// \brief Import symbols from pandas that we need for various type-checking,
+// like pandas.NaT or pandas.NA
+void InitPandasStaticData();
+
 // \brief Use pandas missing value semantics to check if a value is null
 ARROW_PYTHON_EXPORT
 bool PandasObjectIsNull(PyObject* obj);
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 6e23b7f..bdaae4c 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -1171,6 +1171,12 @@ Status GetConverterFlat(const std::shared_ptr<DataType>& type, bool strict_conve
 
 Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
                     bool strict_conversions, std::unique_ptr<SeqConverter>* out) {
+  if (from_pandas) {
+    // ARROW-842: If pandas is not installed then null checks will be less
+    // comprehensive, but that is okay.
+    internal::InitPandasStaticData();
+  }
+
   switch (type->id()) {
     case Type::LIST:
       if (from_pandas) {
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 791ce14..392a322 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1205,6 +1205,17 @@ class TestConvertDateTimeLikeTypes:
         tm.assert_frame_equal(table_pandas_objects,
                               expected_pandas_objects)
 
+    def test_object_null_values(self):
+        # ARROW-842
+        NA = getattr(pd, 'NA', None)
+        values = np.array([datetime(2000, 1, 1), pd.NaT, NA], dtype=object)
+        values_with_none = np.array([datetime(2000, 1, 1), None, None],
+                                    dtype=object)
+        result = pa.array(values, from_pandas=True)
+        expected = pa.array(values_with_none, from_pandas=True)
+        assert result.equals(expected)
+        assert result.null_count == 2
+
     def test_dates_from_integers(self):
         t1 = pa.date32()
         t2 = pa.date64()