You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/11 14:33:40 UTC
arrow git commit: ARROW-421: [Python] Retain parent reference in
PyBytesReader
Repository: arrow
Updated Branches:
refs/heads/master 543e50814 -> 7d3e2a3ab
ARROW-421: [Python] Retain parent reference in PyBytesReader
Pass Buffer to BufferReader so that zero-copy slices retain reference to PyBytesBuffer, which prevents the bytes object from being garbage collected prematurely. Also added some helper tools for inspecting Arrow Buffer objects in Python.
Close #278
Author: Wes McKinney <we...@twosigma.com>
Closes #279 from wesm/ARROW-421 and squashes the following commits:
acf730e [Wes McKinney] Rename method
50c195a [Wes McKinney] Fix accidental typo
ef20185 [Wes McKinney] Pass Buffer to BufferReader so that zero-copy slices retain reference to PyBytesBuffer, which prevents the bytes object from being garbage collected prematurely
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7d3e2a3a
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7d3e2a3a
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7d3e2a3a
Branch: refs/heads/master
Commit: 7d3e2a3ab90324625b738e464a020758379f457a
Parents: 543e508
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Jan 11 09:33:29 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed Jan 11 09:33:29 2017 -0500
----------------------------------------------------------------------
cpp/src/arrow/io/memory.h | 2 ++
python/pyarrow/_parquet.pxd | 2 +-
python/pyarrow/_parquet.pyx | 8 +++---
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/io.pyx | 46 +++++++++++++++++++++++++++++--
python/pyarrow/tests/test_io.py | 14 ++++++++++
python/src/pyarrow/io.cc | 10 ++-----
python/src/pyarrow/io.h | 5 ++--
8 files changed, 69 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/cpp/src/arrow/io/memory.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h
index 8428a12..2d3df42 100644
--- a/cpp/src/arrow/io/memory.h
+++ b/cpp/src/arrow/io/memory.h
@@ -79,6 +79,8 @@ class ARROW_EXPORT BufferReader : public ReadableFileInterface {
bool supports_zero_copy() const override;
+ std::shared_ptr<Buffer> buffer() const { return buffer_; }
+
private:
std::shared_ptr<Buffer> buffer_;
const uint8_t* data_;
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/_parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index faca845..7e49e9e 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -156,7 +156,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
int num_columns()
int64_t num_rows()
int num_row_groups()
- int32_t version()
+ ParquetVersion version()
const c_string created_by()
int num_schema_elements()
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index c0dc3eb..30e3de4 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -138,11 +138,11 @@ cdef class FileMetaData:
property format_version:
def __get__(self):
- cdef int version = self.metadata.version()
- if version == 2:
- return '2.0'
- elif version == 1:
+ cdef ParquetVersion version = self.metadata.version()
+ if version == ParquetVersion_V1:
return '1.0'
+ if version == ParquetVersion_V2:
+ return '2.0'
else:
print('Unrecognized file version, assuming 1.0: {0}'
.format(version))
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index b0f971d..d1970e5 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -66,6 +66,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CBuffer" arrow::Buffer":
uint8_t* data()
int64_t size()
+ shared_ptr[CBuffer] parent()
cdef cppclass ResizableBuffer(CBuffer):
CStatus Resize(int64_t nbytes)
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index cab6ccb..b62de6c 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -123,11 +123,17 @@ cdef class NativeFile:
with nogil:
check_status(self.wr_file.get().Write(buf, bufsize))
- def read(self, int64_t nbytes):
+ def read(self, nbytes=None):
cdef:
+ int64_t c_nbytes
int64_t bytes_read = 0
PyObject* obj
+ if nbytes is None:
+ c_nbytes = self.size() - self.tell()
+ else:
+ c_nbytes = nbytes
+
self._assert_readable()
# Allocate empty write space
@@ -135,17 +141,35 @@ cdef class NativeFile:
cdef uint8_t* buf = <uint8_t*> cp.PyBytes_AS_STRING(<object> obj)
with nogil:
- check_status(self.rd_file.get().Read(nbytes, &bytes_read, buf))
+ check_status(self.rd_file.get().Read(c_nbytes, &bytes_read, buf))
- if bytes_read < nbytes:
+ if bytes_read < c_nbytes:
cp._PyBytes_Resize(&obj, <Py_ssize_t> bytes_read)
return PyObject_to_object(obj)
+ def read_buffer(self, nbytes=None):
+ cdef:
+ int64_t c_nbytes
+ int64_t bytes_read = 0
+ shared_ptr[CBuffer] output
+ self._assert_readable()
+
+ if nbytes is None:
+ c_nbytes = self.size() - self.tell()
+ else:
+ c_nbytes = nbytes
+
+ with nogil:
+ check_status(self.rd_file.get().ReadB(c_nbytes, &output))
+
+ return wrap_buffer(output)
+
# ----------------------------------------------------------------------
# Python file-like objects
+
cdef class PythonFileInterface(NativeFile):
cdef:
object handle
@@ -199,6 +223,16 @@ cdef class Buffer:
def __get__(self):
return self.buffer.get().size()
+ property parent:
+
+ def __get__(self):
+ cdef shared_ptr[CBuffer] parent_buf = self.buffer.get().parent()
+
+ if parent_buf.get() == NULL:
+ return None
+ else:
+ return wrap_buffer(parent_buf)
+
def __getitem__(self, key):
# TODO(wesm): buffer slicing
raise NotImplementedError
@@ -209,6 +243,12 @@ cdef class Buffer:
self.buffer.get().size())
+cdef wrap_buffer(const shared_ptr[CBuffer]& buffer):
+ cdef Buffer result = Buffer()
+ result.buffer = buffer
+ return result
+
+
cdef shared_ptr[PoolBuffer] allocate_buffer():
cdef shared_ptr[PoolBuffer] result
result.reset(new PoolBuffer(pyarrow.get_memory_pool()))
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/tests/test_io.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index c10ed03..3e7a437 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -102,6 +102,20 @@ def test_bytes_reader_non_bytes():
io.BytesReader(u('some sample data'))
+def test_bytes_reader_retains_parent_reference():
+ import gc
+
+ # ARROW-421
+ def get_buffer():
+ data = b'some sample data' * 1000
+ reader = io.BytesReader(data)
+ reader.seek(5)
+ return reader.read_buffer(6)
+
+ buf = get_buffer()
+ gc.collect()
+ assert buf.to_pybytes() == b'sample'
+ assert buf.parent is not None
# ----------------------------------------------------------------------
# Buffers
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/src/pyarrow/io.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc
index ac1aa63..01f851d 100644
--- a/python/src/pyarrow/io.cc
+++ b/python/src/pyarrow/io.cc
@@ -203,14 +203,8 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) {
// A readable file that is backed by a PyBytes
PyBytesReader::PyBytesReader(PyObject* obj)
- : arrow::io::BufferReader(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
- PyBytes_GET_SIZE(obj)),
- obj_(obj) {
- Py_INCREF(obj_);
-}
+ : arrow::io::BufferReader(std::make_shared<PyBytesBuffer>(obj)) {}
-PyBytesReader::~PyBytesReader() {
- Py_DECREF(obj_);
-}
+PyBytesReader::~PyBytesReader() {}
} // namespace pyarrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/src/pyarrow/io.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h
index fd3e7c0..4cb010f 100644
--- a/python/src/pyarrow/io.h
+++ b/python/src/pyarrow/io.h
@@ -22,6 +22,8 @@
#include "arrow/io/memory.h"
#include "pyarrow/config.h"
+
+#include "pyarrow/common.h"
#include "pyarrow/visibility.h"
namespace arrow {
@@ -87,9 +89,6 @@ class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader {
public:
explicit PyBytesReader(PyObject* obj);
virtual ~PyBytesReader();
-
- private:
- PyObject* obj_;
};
// TODO(wesm): seekable output files