You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/12/01 10:40:19 UTC

[arrow] branch master updated: ARROW-10709: [C++][Python] Allow PyReadableFile::Read() to call pyobj.read_buffer()

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 03a89aa  ARROW-10709: [C++][Python] Allow PyReadableFile::Read() to call pyobj.read_buffer()
03a89aa is described below

commit 03a89aa7c05f5d8707c313eadd4dbe2f8a46136a
Author: Maarten A. Breddels <ma...@gmail.com>
AuthorDate: Tue Dec 1 11:39:26 2020 +0100

    ARROW-10709: [C++][Python] Allow PyReadableFile::Read() to call pyobj.read_buffer()
    
    This makes it possible to create a Python-Land-based file object with great performance.
    
    Related https://github.com/vaexio/vaex/pull/1075
    
    Closes #8755 from maartenbreddels/ARROW-10709
    
    Authored-by: Maarten A. Breddels <ma...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/python/io.cc      | 50 ++++++++++++++++++++++++++++++++---------
 python/pyarrow/tests/test_io.py | 28 +++++++++++++++++++++++
 2 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/cpp/src/arrow/python/io.cc b/cpp/src/arrow/python/io.cc
index 515ea13..73525fe 100644
--- a/cpp/src/arrow/python/io.cc
+++ b/cpp/src/arrow/python/io.cc
@@ -44,7 +44,9 @@ namespace py {
 // calling any methods
 class PythonFile {
  public:
-  explicit PythonFile(PyObject* file) : file_(file) { Py_INCREF(file); }
+  explicit PythonFile(PyObject* file) : file_(file), checked_read_buffer_(false) {
+    Py_INCREF(file);
+  }
 
   Status CheckClosed() const {
     if (!file_) {
@@ -108,6 +110,14 @@ class PythonFile {
     return Status::OK();
   }
 
+  Status ReadBuffer(int64_t nbytes, PyObject** out) {
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(n)",
+                                               static_cast<Py_ssize_t>(nbytes));
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    *out = result;
+    return Status::OK();
+  }
+
   Status Write(const void* data, int64_t nbytes) {
     RETURN_NOT_OK(CheckClosed());
 
@@ -152,9 +162,19 @@ class PythonFile {
 
   std::mutex& lock() { return lock_; }
 
+  bool HasReadBuffer() {
+    if (!checked_read_buffer_) {  // we don't want to check this each time
+      has_read_buffer_ = PyObject_HasAttrString(file_.obj(), "read_buffer") == 1;
+      checked_read_buffer_ = true;
+    }
+    return has_read_buffer_;
+  }
+
  private:
   std::mutex lock_;
   OwnedRefNoGIL file_;
+  bool has_read_buffer_;
+  bool checked_read_buffer_;
 };
 
 // ----------------------------------------------------------------------
@@ -199,25 +219,33 @@ Result<int64_t> PyReadableFile::Read(int64_t nbytes, void* out) {
     PyObject* bytes_obj = bytes.obj();
     DCHECK(bytes_obj != NULL);
 
-    if (!PyBytes_Check(bytes_obj)) {
+    Py_buffer py_buf;
+    if (!PyObject_GetBuffer(bytes_obj, &py_buf, PyBUF_ANY_CONTIGUOUS)) {
+      const uint8_t* data = reinterpret_cast<const uint8_t*>(py_buf.buf);
+      std::memcpy(out, data, py_buf.len);
+      int64_t len = py_buf.len;
+      PyBuffer_Release(&py_buf);
+      return len;
+    } else {
       return Status::TypeError(
-          "Python file read() should have returned a bytes object, got '",
+          "Python file read() should have returned a bytes object or an object "
+          "supporting the buffer protocol, got '",
           Py_TYPE(bytes_obj)->tp_name, "' (did you open the file in binary mode?)");
     }
-
-    int64_t bytes_read = PyBytes_GET_SIZE(bytes_obj);
-    std::memcpy(out, PyBytes_AS_STRING(bytes_obj), bytes_read);
-    return bytes_read;
   });
 }
 
 Result<std::shared_ptr<Buffer>> PyReadableFile::Read(int64_t nbytes) {
   return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
-    OwnedRef bytes_obj;
-    RETURN_NOT_OK(file_->Read(nbytes, bytes_obj.ref()));
-    DCHECK(bytes_obj.obj() != NULL);
+    OwnedRef buffer_obj;
+    if (file_->HasReadBuffer()) {
+      RETURN_NOT_OK(file_->ReadBuffer(nbytes, buffer_obj.ref()));
+    } else {
+      RETURN_NOT_OK(file_->Read(nbytes, buffer_obj.ref()));
+    }
+    DCHECK(buffer_obj.obj() != NULL);
 
-    return PyBuffer::FromPyObject(bytes_obj.obj());
+    return PyBuffer::FromPyObject(buffer_obj.obj());
   });
 }
 
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 69dc135..d637fc8 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -163,6 +163,34 @@ def test_python_file_readinto():
         assert len(dst_buf) == length
 
 
+def test_python_file_read_buffer():
+    length = 10
+    data = b'0123456798'
+    dst_buf = bytearray(data)
+
+    class DuckReader:
+        def close(self):
+            pass
+
+        @property
+        def closed(self):
+            return False
+
+        def read_buffer(self, nbytes):
+            assert nbytes == length
+            return memoryview(dst_buf)[:nbytes]
+
+    duck_reader = DuckReader()
+    with pa.PythonFile(duck_reader, mode='r') as f:
+        buf = f.read_buffer(length)
+        assert len(buf) == length
+        assert memoryview(buf).tobytes() == dst_buf[:length]
+        # buf should point to the same memory, so modyfing it
+        memoryview(buf)[0] = ord(b'x')
+        # should modify the original
+        assert dst_buf[0] == ord(b'x')
+
+
 def test_python_file_correct_abc():
     with pa.PythonFile(BytesIO(b''), mode='r') as f:
         assert isinstance(f, BufferedIOBase)