You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/25 16:15:08 UTC
arrow git commit: ARROW-626: [Python] Replace PyBytesBuffer with
zero-copy, memoryview-based PyBuffer
Repository: arrow
Updated Branches:
refs/heads/master c7947dc2d -> 685ebf490
ARROW-626: [Python] Replace PyBytesBuffer with zero-copy, memoryview-based PyBuffer
WIP, but tests all pass
Author: Jeff Knupp <je...@enigma.io>
Closes #410 from jeffknupp/master and squashes the following commits:
bfba71d [Jeff Knupp] Fix typo in test
0a39f24 [Jeff Knupp] Fix some logical issues with initialization; add bytearray test
fb2cfa3 [Jeff Knupp] Add proper reference counting regardless of if buf is memoryview
26f8b74 [Jeff Knupp] Need to investigate why test failed travis but not locally. For now, revert
f7d21ac [Jeff Knupp] ARROW-626: [Python] Replace PyBytesBuffer with zero-copy, memoryview-based PyBuffer
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/685ebf49
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/685ebf49
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/685ebf49
Branch: refs/heads/master
Commit: 685ebf49001ef02134e404dddae2bfd032dc4a65
Parents: c7947dc
Author: Jeff Knupp <je...@enigma.io>
Authored: Sat Mar 25 12:15:02 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sat Mar 25 12:15:02 2017 -0400
----------------------------------------------------------------------
python/pyarrow/includes/pyarrow.pxd | 4 ++--
python/pyarrow/io.pyx | 19 ++++++++-----------
python/pyarrow/tests/test_io.py | 20 +++++++++++++++-----
python/src/pyarrow/common.cc | 24 +++++++++++++++---------
python/src/pyarrow/common.h | 10 +++++++---
python/src/pyarrow/io.cc | 6 +++---
python/src/pyarrow/io.h | 2 +-
7 files changed, 51 insertions(+), 34 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index 805950b..3fdbebc 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -55,8 +55,8 @@ cdef extern from "pyarrow/api.h" namespace "arrow::py" nogil:
cdef extern from "pyarrow/common.h" namespace "arrow::py" nogil:
- cdef cppclass PyBytesBuffer(CBuffer):
- PyBytesBuffer(object o)
+ cdef cppclass PyBuffer(CBuffer):
+ PyBuffer(object o)
cdef extern from "pyarrow/io.h" namespace "arrow::py" nogil:
http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 72e0e0f..cb44ce8 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -501,16 +501,11 @@ cdef class BufferReader(NativeFile):
Buffer buffer
def __cinit__(self, object obj):
- cdef shared_ptr[CBuffer] buf
if isinstance(obj, Buffer):
self.buffer = obj
- elif isinstance(obj, bytes):
- buf.reset(new pyarrow.PyBytesBuffer(obj))
- self.buffer = wrap_buffer(buf)
else:
- raise ValueError('Unable to convert value to buffer: {0}'
- .format(type(obj)))
+ self.buffer = build_arrow_buffer(obj)
self.rd_file.reset(new CBufferReader(self.buffer.buffer))
self.is_readable = 1
@@ -518,16 +513,18 @@ cdef class BufferReader(NativeFile):
self.is_open = True
-def buffer_from_bytes(object obj):
+def build_arrow_buffer(object obj):
"""
Construct an Arrow buffer from a Python bytes object
"""
cdef shared_ptr[CBuffer] buf
- if not isinstance(obj, bytes):
- raise ValueError('Must pass bytes object')
+ try:
+ memoryview(obj)
+ buf.reset(new pyarrow.PyBuffer(obj))
+ return wrap_buffer(buf)
+ except TypeError:
+ raise ValueError('Must pass object that implements buffer protocol')
- buf.reset(new pyarrow.PyBytesBuffer(obj))
- return wrap_buffer(buf)
cdef Buffer wrap_buffer(const shared_ptr[CBuffer]& buf):
http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/tests/test_io.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index c6caba5..9cd15c4 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -82,7 +82,6 @@ def test_bytes_reader():
# Like a BytesIO, but zero-copy underneath for C++ consumers
data = b'some sample data'
f = io.BufferReader(data)
-
assert f.tell() == 0
assert f.size() == len(data)
@@ -128,7 +127,7 @@ def test_bytes_reader_retains_parent_reference():
def test_buffer_bytes():
val = b'some data'
- buf = io.buffer_from_bytes(val)
+ buf = io.build_arrow_buffer(val)
assert isinstance(buf, io.Buffer)
result = buf.to_pybytes()
@@ -138,18 +137,29 @@ def test_buffer_bytes():
def test_buffer_memoryview():
val = b'some data'
- buf = io.buffer_from_bytes(val)
+ buf = io.build_arrow_buffer(val)
assert isinstance(buf, io.Buffer)
result = memoryview(buf)
assert result == val
+def test_buffer_bytearray():
+ val = bytearray(b'some data')
+
+
+ buf = io.build_arrow_buffer(val)
+ assert isinstance(buf, io.Buffer)
+
+ result = bytearray(buf)
+
+ assert result == val
+
def test_buffer_memoryview_is_immutable():
val = b'some data'
- buf = io.buffer_from_bytes(val)
+ buf = io.build_arrow_buffer(val)
assert isinstance(buf, io.Buffer)
result = memoryview(buf)
@@ -192,7 +202,7 @@ def test_buffer_protocol_ref_counting():
import gc
def make_buffer(bytes_obj):
- return bytearray(io.buffer_from_bytes(bytes_obj))
+ return bytearray(io.build_arrow_buffer(bytes_obj))
buf = make_buffer(b'foo')
gc.collect()
http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/common.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc
index c898f63..792aa47 100644
--- a/python/src/pyarrow/common.cc
+++ b/python/src/pyarrow/common.cc
@@ -45,18 +45,24 @@ MemoryPool* get_memory_pool() {
}
// ----------------------------------------------------------------------
-// PyBytesBuffer
+// PyBuffer
-PyBytesBuffer::PyBytesBuffer(PyObject* obj)
- : Buffer(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
- PyBytes_GET_SIZE(obj)),
- obj_(obj) {
- Py_INCREF(obj_);
+PyBuffer::PyBuffer(PyObject* obj)
+ : Buffer(nullptr, 0) {
+ if (PyObject_CheckBuffer(obj)) {
+ obj_ = PyMemoryView_FromObject(obj);
+ Py_buffer* buffer = PyMemoryView_GET_BUFFER(obj_);
+ data_ = reinterpret_cast<const uint8_t*>(buffer->buf);
+ size_ = buffer->len;
+ capacity_ = buffer->len;
+ is_mutable_ = false;
+ Py_INCREF(obj_);
+ }
}
-PyBytesBuffer::~PyBytesBuffer() {
- PyAcquireGIL lock;
- Py_DECREF(obj_);
+PyBuffer::~PyBuffer() {
+ PyAcquireGIL lock;
+ Py_DECREF(obj_);
}
} // namespace py
http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
index 0b4c6be..b4e4ea6 100644
--- a/python/src/pyarrow/common.h
+++ b/python/src/pyarrow/common.h
@@ -118,10 +118,14 @@ class ARROW_EXPORT NumPyBuffer : public Buffer {
PyArrayObject* arr_;
};
-class ARROW_EXPORT PyBytesBuffer : public Buffer {
+class ARROW_EXPORT PyBuffer : public Buffer {
public:
- PyBytesBuffer(PyObject* obj);
- ~PyBytesBuffer();
+ /// Note that the GIL must be held when calling the PyBuffer constructor.
+ ///
+ /// While memoryview objects support multi-demensional buffers, PyBuffer only supports
+ /// one-dimensional byte buffers.
+ PyBuffer(PyObject* obj);
+ ~PyBuffer();
private:
PyObject* obj_;
http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/io.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc
index 0aa61dc..c66155b 100644
--- a/python/src/pyarrow/io.cc
+++ b/python/src/pyarrow/io.cc
@@ -156,7 +156,7 @@ Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr<Buffer>* out) {
PyObject* bytes_obj;
ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj));
- *out = std::make_shared<PyBytesBuffer>(bytes_obj);
+ *out = std::make_shared<PyBuffer>(bytes_obj);
Py_DECREF(bytes_obj);
return Status::OK();
@@ -210,10 +210,10 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) {
}
// ----------------------------------------------------------------------
-// A readable file that is backed by a PyBytes
+// A readable file that is backed by a PyBuffer
PyBytesReader::PyBytesReader(PyObject* obj)
- : io::BufferReader(std::make_shared<PyBytesBuffer>(obj)) {}
+ : io::BufferReader(std::make_shared<PyBuffer>(obj)) {}
PyBytesReader::~PyBytesReader() {}
http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/io.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h
index e38cd81..89af609 100644
--- a/python/src/pyarrow/io.h
+++ b/python/src/pyarrow/io.h
@@ -84,7 +84,7 @@ class ARROW_EXPORT PyOutputStream : public io::OutputStream {
std::unique_ptr<PythonFile> file_;
};
-// A zero-copy reader backed by a PyBytes object
+// A zero-copy reader backed by a PyBuffer object
class ARROW_EXPORT PyBytesReader : public io::BufferReader {
public:
explicit PyBytesReader(PyObject* obj);