You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/14 18:35:34 UTC

arrow git commit: ARROW-1017: [Python] Fix memory leaks in conversion to pandas.DataFrame

Repository: arrow
Updated Branches:
  refs/heads/master 852ee4fbf -> c7839e9fa


ARROW-1017: [Python] Fix memory leaks in conversion to pandas.DataFrame

Notes:

* `PyList_Append` increments ref count, so new objects must be DECREF'd after being inserted
* `PyArray_SimpleNewFromDescr` does not set `NPY_ARRAY_OWNDATA`, neither does `NewFromDescr`

Author: Wes McKinney <we...@twosigma.com>

Closes #685 from wesm/ARROW-1017 and squashes the following commits:

8459123 [Wes McKinney] Fix memory leak caused by list append ref count, lack of setting NPY_ARRAY_OWNDATA


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c7839e9f
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c7839e9f
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c7839e9f

Branch: refs/heads/master
Commit: c7839e9fab91e62cced9367f23e561afb6728652
Parents: 852ee4f
Author: Wes McKinney <we...@twosigma.com>
Authored: Sun May 14 14:35:28 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sun May 14 14:35:28 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/common.h          |  7 +++++-
 cpp/src/arrow/python/pandas_convert.cc | 25 ++++++++++-----------
 python/scripts/test_leak.py            | 35 +++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/c7839e9f/cpp/src/arrow/python/common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index f6e706b..ec40d0e 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -69,7 +69,7 @@ class ARROW_EXPORT OwnedRef {
 
   ~OwnedRef() {
     PyAcquireGIL lock;
-    Py_XDECREF(obj_);
+    release();
   }
 
   void reset(PyObject* obj) {
@@ -80,6 +80,11 @@ class ARROW_EXPORT OwnedRef {
     obj_ = obj;
   }
 
+  void release() {
+    Py_XDECREF(obj_);
+    obj_ = nullptr;
+  }
+
   PyObject* obj() const { return obj_; }
 
  private:

http://git-wip-us.apache.org/repos/asf/arrow/blob/c7839e9f/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 264bed1..b6fb05e 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -1023,7 +1023,8 @@ static inline PyObject* NewArray1DFromType(
   }
 
   set_numpy_metadata(type, arrow_type, descr);
-  return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, 0, nullptr);
+  return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data,
+      NPY_ARRAY_OWNDATA | NPY_ARRAY_CARRAY, nullptr);
 }
 
 class PandasBlock {
@@ -1078,12 +1079,10 @@ class PandasBlock {
     PyObject* block_arr;
     if (ndim == 2) {
       npy_intp block_dims[2] = {num_columns_, num_rows_};
-      block_arr = PyArray_NewFromDescr(
-          &PyArray_Type, descr, 2, block_dims, nullptr, nullptr, 0, nullptr);
+      block_arr = PyArray_SimpleNewFromDescr(2, block_dims, descr);
     } else {
       npy_intp block_dims[1] = {num_rows_};
-      block_arr = PyArray_NewFromDescr(
-          &PyArray_Type, descr, 1, block_dims, nullptr, nullptr, 0, nullptr);
+      block_arr = PyArray_SimpleNewFromDescr(1, block_dims, descr);
     }
 
     if (block_arr == NULL) {
@@ -1091,6 +1090,8 @@ class PandasBlock {
       return Status::OK();
     }
 
+    PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(block_arr), NPY_ARRAY_OWNDATA);
+
     npy_intp placement_dims[1] = {num_columns_};
     PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
     if (placement_arr == NULL) {
@@ -1357,8 +1358,6 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values)
 class ObjectBlock : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
-  virtual ~ObjectBlock() {}
-
   Status Allocate() override { return AllocateNDArray(NPY_OBJECT); }
 
   Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
@@ -1416,7 +1415,6 @@ template <int ARROW_TYPE, typename C_TYPE>
 class IntBlock : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
-
   Status Allocate() override {
     return AllocateNDArray(arrow_traits<ARROW_TYPE>::npy_type);
   }
@@ -1450,7 +1448,6 @@ using Int64Block = IntBlock<Type::INT64, int64_t>;
 class Float32Block : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
-
   Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); }
 
   Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
@@ -1470,7 +1467,6 @@ class Float32Block : public PandasBlock {
 class Float64Block : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
-
   Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); }
 
   Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
@@ -1523,7 +1519,6 @@ class Float64Block : public PandasBlock {
 class BoolBlock : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
-
   Status Allocate() override { return AllocateNDArray(NPY_BOOL); }
 
   Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement,
@@ -1544,7 +1539,6 @@ class BoolBlock : public PandasBlock {
 class DatetimeBlock : public PandasBlock {
  public:
   using PandasBlock::PandasBlock;
-
   Status AllocateDatetime(int ndim) {
     RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME, ndim));
 
@@ -1629,7 +1623,6 @@ template <int ARROW_INDEX_TYPE>
 class CategoricalBlock : public PandasBlock {
  public:
   explicit CategoricalBlock(int64_t num_rows) : PandasBlock(num_rows, 1) {}
-
   Status Allocate() override {
     constexpr int npy_type = arrow_traits<ARROW_INDEX_TYPE>::npy_type;
 
@@ -1960,6 +1953,9 @@ class DataFrameBlockCreator {
       PyObject* item;
       RETURN_NOT_OK(it.second->GetPyResult(&item));
       if (PyList_Append(list, item) < 0) { RETURN_IF_PYERROR(); }
+
+      // ARROW-1017; PyList_Append increments object refcount
+      Py_DECREF(item);
     }
     return Status::OK();
   }
@@ -2045,6 +2041,9 @@ class ArrowDeserializer {
     // Arrow data is immutable.
     PyArray_CLEARFLAGS(arr_, NPY_ARRAY_WRITEABLE);
 
+    // Arrow data is owned by another
+    PyArray_CLEARFLAGS(arr_, NPY_ARRAY_OWNDATA);
+
     return Status::OK();
   }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/c7839e9f/python/scripts/test_leak.py
----------------------------------------------------------------------
diff --git a/python/scripts/test_leak.py b/python/scripts/test_leak.py
new file mode 100644
index 0000000..2b197b6
--- /dev/null
+++ b/python/scripts/test_leak.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pyarrow as pa
+import numpy as np
+import memory_profiler
+import gc
+
+
+def leak():
+    data = [pa.array(np.concatenate([np.random.randn(100000)] * 1000))]
+    table = pa.Table.from_arrays(data, ['foo'])
+    while True:
+        print('calling to_pandas')
+        print('memory_usage: {0}'.format(memory_profiler.memory_usage()))
+        table.to_pandas()
+        gc.collect()
+
+leak()