You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/14 20:34:31 UTC

arrow git commit: ARROW-1004: [Python] Add conversions for numpy object arrays with integers and floats

Repository: arrow
Updated Branches:
  refs/heads/master 393f46abd -> 37dbddf0d


ARROW-1004: [Python] Add conversions for numpy object arrays with integers and floats

Author: Wes McKinney <we...@twosigma.com>

Closes #681 from wesm/ARROW-1004 and squashes the following commits:

9e0b2eae [Wes McKinney] Code review comments
45f1ecb9 [Wes McKinney] Fixes for manylinux1
4e4c7529 [Wes McKinney] Add conversions for numpy object arrays with integers and floats


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/37dbddf0
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/37dbddf0
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/37dbddf0

Branch: refs/heads/master
Commit: 37dbddf0dc6582586a2bea98a436cb20726799a4
Parents: 393f46a
Author: Wes McKinney <we...@twosigma.com>
Authored: Sun May 14 16:30:19 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sun May 14 16:30:19 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/pandas_convert.cc      | 82 +++++++++++++++++++++++-
 python/pyarrow/tests/test_convert_pandas.py | 18 ++++++
 2 files changed, 98 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/37dbddf0/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index b6fb05e..96dd09a 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -80,6 +80,14 @@ static inline bool PyObject_is_string(const PyObject* obj) {
 #endif
 }
 
+static inline bool PyObject_is_float(const PyObject* obj) {  // thin PyFloat_Check wrapper
+  return PyFloat_Check(obj) != 0;
+}
+
+static inline bool PyObject_is_integer(const PyObject* obj) {  // bools never count
+  return PyBool_Check(obj) ? false : static_cast<bool>(PyArray_IsIntegerScalar(obj));
+}
+
 template <int TYPE>
 static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   typedef npy_traits<TYPE> traits;
@@ -394,9 +402,11 @@ class PandasConverter {
   template <typename ArrowType>
   Status ConvertDates();
 
+  Status ConvertBooleans();
   Status ConvertObjectStrings();
+  Status ConvertObjectFloats();
   Status ConvertObjectFixedWidthBytes(const std::shared_ptr<DataType>& type);
-  Status ConvertBooleans();
+  Status ConvertObjectIntegers();
   Status ConvertLists(const std::shared_ptr<DataType>& type);
   Status ConvertObjects();
   Status ConvertDecimals();
@@ -610,6 +620,70 @@ Status PandasConverter::ConvertObjectStrings() {
   return Status::OK();
 }
 
+// Converts the object ndarray arr_ into a double array stored in out_. Slots that are
+// masked or pandas-null are appended as nulls; a non-null item failing PyFloat_Check
+// aborts with InvalidConversion. A PyAcquireGIL guard is held for the whole call.
+Status PandasConverter::ConvertObjectFloats() {
+  PyAcquireGIL lock;
+
+  DoubleBuilder builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+  const bool have_mask = (mask_ != nullptr);
+  if (have_mask) { mask_values.Init(mask_); }
+
+  for (int64_t i = 0; i < objects.size(); ++i) {
+    PyObject* item = objects[i];
+    // The mask is consulted first; the pandas null check only runs for unmasked slots.
+    const bool is_null = (have_mask && mask_values[i]) || PandasObjectIsNull(item);
+    if (is_null) {
+      RETURN_NOT_OK(builder.AppendNull());
+      continue;
+    }
+    if (!PyFloat_Check(item)) { return InvalidConversion(item, "float"); }
+
+    const double value = PyFloat_AsDouble(item);
+    RETURN_IF_PYERROR();
+    RETURN_NOT_OK(builder.Append(value));
+  }
+
+  return builder.Finish(&out_);
+}
+
+// Converts the object ndarray arr_ into an int64 array stored in out_. Slots that are
+// masked or pandas-null are appended as nulls; a non-null item rejected by
+// PyObject_is_integer aborts with InvalidConversion. The GIL guard spans the call.
+Status PandasConverter::ConvertObjectIntegers() {
+  PyAcquireGIL lock;
+
+  Int64Builder builder(pool_);
+  RETURN_NOT_OK(builder.Resize(length_));
+
+  Ndarray1DIndexer<PyObject*> objects(arr_);
+  Ndarray1DIndexer<uint8_t> mask_values;
+  const bool have_mask = (mask_ != nullptr);
+  if (have_mask) { mask_values.Init(mask_); }
+
+  for (int64_t i = 0; i < objects.size(); ++i) {
+    PyObject* obj = objects[i];
+    if ((have_mask && mask_values[i]) || PandasObjectIsNull(obj)) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else if (PyObject_is_integer(obj)) {
+      // Use the long long API: C long is only 32 bits on LLP64 platforms (64-bit
+      // Windows), where PyLong_AsLong rejects valid int64 values with OverflowError.
+      const int64_t val = static_cast<int64_t>(PyLong_AsLongLong(obj));
+      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(builder.Append(val));
+    } else {
+      return InvalidConversion(obj, "integer");
+    }
+  }
+
+  return builder.Finish(&out_);
+}
+
 Status PandasConverter::ConvertObjectFixedWidthBytes(
     const std::shared_ptr<DataType>& type) {
   PyAcquireGIL lock;
@@ -804,8 +878,12 @@ Status PandasConverter::ConvertObjects() {
         continue;
       } else if (PyObject_is_string(objects[i])) {
         return ConvertObjectStrings();
+      } else if (PyObject_is_float(objects[i])) {
+        return ConvertObjectFloats();
       } else if (PyBool_Check(objects[i])) {
         return ConvertBooleans();
+      } else if (PyObject_is_integer(objects[i])) {
+        return ConvertObjectIntegers();
       } else if (PyDate_CheckExact(objects[i])) {
         // We could choose Date32 or Date64
         return ConvertDates<Date32Type>();
@@ -813,7 +891,7 @@ Status PandasConverter::ConvertObjects() {
         return ConvertDecimals();
       } else {
         return InvalidConversion(
-            const_cast<PyObject*>(objects[i]), "string, bool, or date");
+            const_cast<PyObject*>(objects[i]), "string, bool, float, int, date, decimal");
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/37dbddf0/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 9b9b751..be35905 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -140,6 +140,24 @@ class TestPandasConversion(unittest.TestCase):
         result = table.to_pandas()
         tm.assert_frame_equal(result, ex_frame)
 
+    def test_float_object_nulls(self):
+        # Object column mixing None, float and np.float64; a float64 schema is expected.
+        values = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
+        frame = pd.DataFrame({'floats': values})
+        expected = pd.DataFrame({'floats': pd.to_numeric(values)})
+        schema = pa.schema([pa.field('floats', pa.float64())])
+        self._check_pandas_roundtrip(frame, expected=expected,
+                                     expected_schema=schema)
+
+    def test_int_object_nulls(self):
+        # Object column mixing None, int and np.int64; an int64 schema is expected.
+        values = np.array([None, 1, np.int64(3)] * 5, dtype=object)
+        frame = pd.DataFrame({'ints': values})
+        expected = pd.DataFrame({'ints': pd.to_numeric(values)})
+        schema = pa.schema([pa.field('ints', pa.int64())])
+        self._check_pandas_roundtrip(frame, expected=expected,
+                                     expected_schema=schema)
+
     def test_integer_no_nulls(self):
         data = OrderedDict()
         fields = []