You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/11 14:33:40 UTC

arrow git commit: ARROW-421: [Python] Retain parent reference in PyBytesReader

Repository: arrow
Updated Branches:
  refs/heads/master 543e50814 -> 7d3e2a3ab


ARROW-421: [Python] Retain parent reference in PyBytesReader

Pass Buffer to BufferReader so that zero-copy slices retain reference to PyBytesBuffer, which prevents the bytes object from being garbage collected prematurely. Also added some helper tools for inspecting Arrow Buffer objects in Python.

Close #278

Author: Wes McKinney <we...@twosigma.com>

Closes #279 from wesm/ARROW-421 and squashes the following commits:

acf730e [Wes McKinney] Rename method
50c195a [Wes McKinney] Fix accidental typo
ef20185 [Wes McKinney] Pass Buffer to BufferReader so that zero-copy slices retain reference to PyBytesBuffer, which prevents the bytes object from being garbage collected prematurely


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7d3e2a3a
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7d3e2a3a
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7d3e2a3a

Branch: refs/heads/master
Commit: 7d3e2a3ab90324625b738e464a020758379f457a
Parents: 543e508
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Jan 11 09:33:29 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed Jan 11 09:33:29 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/io/memory.h            |  2 ++
 python/pyarrow/_parquet.pxd          |  2 +-
 python/pyarrow/_parquet.pyx          |  8 +++---
 python/pyarrow/includes/libarrow.pxd |  1 +
 python/pyarrow/io.pyx                | 46 +++++++++++++++++++++++++++++--
 python/pyarrow/tests/test_io.py      | 14 ++++++++++
 python/src/pyarrow/io.cc             | 10 ++-----
 python/src/pyarrow/io.h              |  5 ++--
 8 files changed, 69 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/cpp/src/arrow/io/memory.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h
index 8428a12..2d3df42 100644
--- a/cpp/src/arrow/io/memory.h
+++ b/cpp/src/arrow/io/memory.h
@@ -79,6 +79,8 @@ class ARROW_EXPORT BufferReader : public ReadableFileInterface {
 
   bool supports_zero_copy() const override;
 
+  std::shared_ptr<Buffer> buffer() const { return buffer_; }
+
  private:
   std::shared_ptr<Buffer> buffer_;
   const uint8_t* data_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/_parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index faca845..7e49e9e 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -156,7 +156,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
         int num_columns()
         int64_t num_rows()
         int num_row_groups()
-        int32_t version()
+        ParquetVersion version()
         const c_string created_by()
         int num_schema_elements()
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index c0dc3eb..30e3de4 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -138,11 +138,11 @@ cdef class FileMetaData:
     property format_version:
 
         def __get__(self):
-            cdef int version = self.metadata.version()
-            if version == 2:
-                return '2.0'
-            elif version == 1:
+            cdef ParquetVersion version = self.metadata.version()
+            if version == ParquetVersion_V1:
                 return '1.0'
+            if version == ParquetVersion_V2:
+                return '2.0'
             else:
                 print('Unrecognized file version, assuming 1.0: {0}'
                       .format(version))

http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index b0f971d..d1970e5 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -66,6 +66,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CBuffer" arrow::Buffer":
         uint8_t* data()
         int64_t size()
+        shared_ptr[CBuffer] parent()
 
     cdef cppclass ResizableBuffer(CBuffer):
         CStatus Resize(int64_t nbytes)

http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index cab6ccb..b62de6c 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -123,11 +123,17 @@ cdef class NativeFile:
         with nogil:
             check_status(self.wr_file.get().Write(buf, bufsize))
 
-    def read(self, int64_t nbytes):
+    def read(self, nbytes=None):
         cdef:
+            int64_t c_nbytes
             int64_t bytes_read = 0
             PyObject* obj
 
+        if nbytes is None:
+            c_nbytes = self.size() - self.tell()
+        else:
+            c_nbytes = nbytes
+
         self._assert_readable()
 
         # Allocate empty write space
@@ -135,17 +141,35 @@ cdef class NativeFile:
 
         cdef uint8_t* buf = <uint8_t*> cp.PyBytes_AS_STRING(<object> obj)
         with nogil:
-            check_status(self.rd_file.get().Read(nbytes, &bytes_read, buf))
+            check_status(self.rd_file.get().Read(c_nbytes, &bytes_read, buf))
 
-        if bytes_read < nbytes:
+        if bytes_read < c_nbytes:
             cp._PyBytes_Resize(&obj, <Py_ssize_t> bytes_read)
 
         return PyObject_to_object(obj)
 
+    def read_buffer(self, nbytes=None):
+        cdef:
+            int64_t c_nbytes
+            int64_t bytes_read = 0
+            shared_ptr[CBuffer] output
+        self._assert_readable()
+
+        if nbytes is None:
+            c_nbytes = self.size() - self.tell()
+        else:
+            c_nbytes = nbytes
+
+        with nogil:
+            check_status(self.rd_file.get().ReadB(c_nbytes, &output))
+
+        return wrap_buffer(output)
+
 
 # ----------------------------------------------------------------------
 # Python file-like objects
 
+
 cdef class PythonFileInterface(NativeFile):
     cdef:
         object handle
@@ -199,6 +223,16 @@ cdef class Buffer:
         def __get__(self):
             return self.buffer.get().size()
 
+    property parent:
+
+        def __get__(self):
+            cdef shared_ptr[CBuffer] parent_buf = self.buffer.get().parent()
+
+            if parent_buf.get() == NULL:
+                return None
+            else:
+                return wrap_buffer(parent_buf)
+
     def __getitem__(self, key):
         # TODO(wesm): buffer slicing
         raise NotImplementedError
@@ -209,6 +243,12 @@ cdef class Buffer:
             self.buffer.get().size())
 
 
+cdef wrap_buffer(const shared_ptr[CBuffer]& buffer):
+    cdef Buffer result = Buffer()
+    result.buffer = buffer
+    return result
+
+
 cdef shared_ptr[PoolBuffer] allocate_buffer():
     cdef shared_ptr[PoolBuffer] result
     result.reset(new PoolBuffer(pyarrow.get_memory_pool()))

http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/tests/test_io.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index c10ed03..3e7a437 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -102,6 +102,20 @@ def test_bytes_reader_non_bytes():
         io.BytesReader(u('some sample data'))
 
 
+def test_bytes_reader_retains_parent_reference():
+    import gc
+
+    # ARROW-421
+    def get_buffer():
+        data = b'some sample data' * 1000
+        reader = io.BytesReader(data)
+        reader.seek(5)
+        return reader.read_buffer(6)
+
+    buf = get_buffer()
+    gc.collect()
+    assert buf.to_pybytes() == b'sample'
+    assert buf.parent is not None
 
 # ----------------------------------------------------------------------
 # Buffers

http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/src/pyarrow/io.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc
index ac1aa63..01f851d 100644
--- a/python/src/pyarrow/io.cc
+++ b/python/src/pyarrow/io.cc
@@ -203,14 +203,8 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) {
 // A readable file that is backed by a PyBytes
 
 PyBytesReader::PyBytesReader(PyObject* obj)
-    : arrow::io::BufferReader(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
-          PyBytes_GET_SIZE(obj)),
-      obj_(obj) {
-  Py_INCREF(obj_);
-}
+    : arrow::io::BufferReader(std::make_shared<PyBytesBuffer>(obj)) {}
 
-PyBytesReader::~PyBytesReader() {
-  Py_DECREF(obj_);
-}
+PyBytesReader::~PyBytesReader() {}
 
 }  // namespace pyarrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/src/pyarrow/io.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h
index fd3e7c0..4cb010f 100644
--- a/python/src/pyarrow/io.h
+++ b/python/src/pyarrow/io.h
@@ -22,6 +22,8 @@
 #include "arrow/io/memory.h"
 
 #include "pyarrow/config.h"
+
+#include "pyarrow/common.h"
 #include "pyarrow/visibility.h"
 
 namespace arrow {
@@ -87,9 +89,6 @@ class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader {
  public:
   explicit PyBytesReader(PyObject* obj);
   virtual ~PyBytesReader();
-
- private:
-  PyObject* obj_;
 };
 
 // TODO(wesm): seekable output files