You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/25 16:15:08 UTC

arrow git commit: ARROW-626: [Python] Replace PyBytesBuffer with zero-copy, memoryview-based PyBuffer

Repository: arrow
Updated Branches:
  refs/heads/master c7947dc2d -> 685ebf490


ARROW-626: [Python] Replace PyBytesBuffer with zero-copy, memoryview-based PyBuffer

WIP, but tests all pass

Author: Jeff Knupp <je...@enigma.io>

Closes #410 from jeffknupp/master and squashes the following commits:

bfba71d [Jeff Knupp] Fix typo in test
0a39f24 [Jeff Knupp] Fix some logical issues with initialization; add bytearray test
fb2cfa3 [Jeff Knupp] Add proper reference counting regardless of if buf is memoryview
26f8b74 [Jeff Knupp] Need to investigate why test failed travis but not locally. For now, revert
f7d21ac [Jeff Knupp] ARROW-626: [Python] Replace PyBytesBuffer with zero-copy, memoryview-based PyBuffer


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/685ebf49
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/685ebf49
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/685ebf49

Branch: refs/heads/master
Commit: 685ebf49001ef02134e404dddae2bfd032dc4a65
Parents: c7947dc
Author: Jeff Knupp <je...@enigma.io>
Authored: Sat Mar 25 12:15:02 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sat Mar 25 12:15:02 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/includes/pyarrow.pxd |  4 ++--
 python/pyarrow/io.pyx               | 19 ++++++++-----------
 python/pyarrow/tests/test_io.py     | 20 +++++++++++++++-----
 python/src/pyarrow/common.cc        | 24 +++++++++++++++---------
 python/src/pyarrow/common.h         | 10 +++++++---
 python/src/pyarrow/io.cc            |  6 +++---
 python/src/pyarrow/io.h             |  2 +-
 7 files changed, 51 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index 805950b..3fdbebc 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -55,8 +55,8 @@ cdef extern from "pyarrow/api.h" namespace "arrow::py" nogil:
 
 
 cdef extern from "pyarrow/common.h" namespace "arrow::py" nogil:
-    cdef cppclass PyBytesBuffer(CBuffer):
-        PyBytesBuffer(object o)
+    cdef cppclass PyBuffer(CBuffer):
+        PyBuffer(object o)
 
 
 cdef extern from "pyarrow/io.h" namespace "arrow::py" nogil:

http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 72e0e0f..cb44ce8 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -501,16 +501,11 @@ cdef class BufferReader(NativeFile):
         Buffer buffer
 
     def __cinit__(self, object obj):
-        cdef shared_ptr[CBuffer] buf
 
         if isinstance(obj, Buffer):
             self.buffer = obj
-        elif isinstance(obj, bytes):
-            buf.reset(new pyarrow.PyBytesBuffer(obj))
-            self.buffer = wrap_buffer(buf)
         else:
-            raise ValueError('Unable to convert value to buffer: {0}'
-                             .format(type(obj)))
+            self.buffer = build_arrow_buffer(obj)
 
         self.rd_file.reset(new CBufferReader(self.buffer.buffer))
         self.is_readable = 1
@@ -518,16 +513,18 @@ cdef class BufferReader(NativeFile):
         self.is_open = True
 
 
-def buffer_from_bytes(object obj):
+def build_arrow_buffer(object obj):
     """
     Construct an Arrow buffer from a Python bytes object
     """
     cdef shared_ptr[CBuffer] buf
-    if not isinstance(obj, bytes):
-        raise ValueError('Must pass bytes object')
+    try:
+        memoryview(obj)
+        buf.reset(new pyarrow.PyBuffer(obj))
+        return wrap_buffer(buf)
+    except TypeError:
+        raise ValueError('Must pass object that implements buffer protocol')
 
-    buf.reset(new pyarrow.PyBytesBuffer(obj))
-    return wrap_buffer(buf)
 
 
 cdef Buffer wrap_buffer(const shared_ptr[CBuffer]& buf):

http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/tests/test_io.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index c6caba5..9cd15c4 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -82,7 +82,6 @@ def test_bytes_reader():
     # Like a BytesIO, but zero-copy underneath for C++ consumers
     data = b'some sample data'
     f = io.BufferReader(data)
-
     assert f.tell() == 0
 
     assert f.size() == len(data)
@@ -128,7 +127,7 @@ def test_bytes_reader_retains_parent_reference():
 def test_buffer_bytes():
     val = b'some data'
 
-    buf = io.buffer_from_bytes(val)
+    buf = io.build_arrow_buffer(val)
     assert isinstance(buf, io.Buffer)
 
     result = buf.to_pybytes()
@@ -138,18 +137,29 @@ def test_buffer_bytes():
 def test_buffer_memoryview():
     val = b'some data'
 
-    buf = io.buffer_from_bytes(val)
+    buf = io.build_arrow_buffer(val)
     assert isinstance(buf, io.Buffer)
 
     result = memoryview(buf)
 
     assert result == val
 
+def test_buffer_bytearray():
+    val = bytearray(b'some data')
+
+
+    buf = io.build_arrow_buffer(val)
+    assert isinstance(buf, io.Buffer)
+
+    result = bytearray(buf)
+
+    assert result == val
+
 
 def test_buffer_memoryview_is_immutable():
     val = b'some data'
 
-    buf = io.buffer_from_bytes(val)
+    buf = io.build_arrow_buffer(val)
     assert isinstance(buf, io.Buffer)
 
     result = memoryview(buf)
@@ -192,7 +202,7 @@ def test_buffer_protocol_ref_counting():
     import gc
 
     def make_buffer(bytes_obj):
-        return bytearray(io.buffer_from_bytes(bytes_obj))
+        return bytearray(io.build_arrow_buffer(bytes_obj))
 
     buf = make_buffer(b'foo')
     gc.collect()

http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/common.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc
index c898f63..792aa47 100644
--- a/python/src/pyarrow/common.cc
+++ b/python/src/pyarrow/common.cc
@@ -45,18 +45,24 @@ MemoryPool* get_memory_pool() {
 }
 
 // ----------------------------------------------------------------------
-// PyBytesBuffer
+// PyBuffer
 
-PyBytesBuffer::PyBytesBuffer(PyObject* obj)
-    : Buffer(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)),
-          PyBytes_GET_SIZE(obj)),
-      obj_(obj) {
-  Py_INCREF(obj_);
+PyBuffer::PyBuffer(PyObject* obj)
+    : Buffer(nullptr, 0) {
+    if (PyObject_CheckBuffer(obj)) {
+        obj_ = PyMemoryView_FromObject(obj);
+        Py_buffer* buffer = PyMemoryView_GET_BUFFER(obj_);
+        data_ = reinterpret_cast<const uint8_t*>(buffer->buf);
+        size_ = buffer->len;
+        capacity_ = buffer->len;
+        is_mutable_ = false;
+        Py_INCREF(obj_);
+    } 
 }
 
-PyBytesBuffer::~PyBytesBuffer() {
-  PyAcquireGIL lock;
-  Py_DECREF(obj_);
+PyBuffer::~PyBuffer() {
+    PyAcquireGIL lock;
+    Py_DECREF(obj_);
 }
 
 }  // namespace py

http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/common.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
index 0b4c6be..b4e4ea6 100644
--- a/python/src/pyarrow/common.h
+++ b/python/src/pyarrow/common.h
@@ -118,10 +118,14 @@ class ARROW_EXPORT NumPyBuffer : public Buffer {
   PyArrayObject* arr_;
 };
 
-class ARROW_EXPORT PyBytesBuffer : public Buffer {
+class ARROW_EXPORT PyBuffer : public Buffer {
  public:
-  PyBytesBuffer(PyObject* obj);
-  ~PyBytesBuffer();
+  /// Note that the GIL must be held when calling the PyBuffer constructor.
+  ///
+  /// While memoryview objects support multi-demensional buffers, PyBuffer only supports
+  /// one-dimensional byte buffers.
+  PyBuffer(PyObject* obj);
+  ~PyBuffer();
 
  private:
   PyObject* obj_;

http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/io.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc
index 0aa61dc..c66155b 100644
--- a/python/src/pyarrow/io.cc
+++ b/python/src/pyarrow/io.cc
@@ -156,7 +156,7 @@ Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr<Buffer>* out) {
   PyObject* bytes_obj;
   ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj));
 
-  *out = std::make_shared<PyBytesBuffer>(bytes_obj);
+  *out = std::make_shared<PyBuffer>(bytes_obj);
   Py_DECREF(bytes_obj);
 
   return Status::OK();
@@ -210,10 +210,10 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) {
 }
 
 // ----------------------------------------------------------------------
-// A readable file that is backed by a PyBytes
+// A readable file that is backed by a PyBuffer
 
 PyBytesReader::PyBytesReader(PyObject* obj)
-    : io::BufferReader(std::make_shared<PyBytesBuffer>(obj)) {}
+    : io::BufferReader(std::make_shared<PyBuffer>(obj)) {}
 
 PyBytesReader::~PyBytesReader() {}
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/io.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h
index e38cd81..89af609 100644
--- a/python/src/pyarrow/io.h
+++ b/python/src/pyarrow/io.h
@@ -84,7 +84,7 @@ class ARROW_EXPORT PyOutputStream : public io::OutputStream {
   std::unique_ptr<PythonFile> file_;
 };
 
-// A zero-copy reader backed by a PyBytes object
+// A zero-copy reader backed by a PyBuffer object
 class ARROW_EXPORT PyBytesReader : public io::BufferReader {
  public:
   explicit PyBytesReader(PyObject* obj);