You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/12/01 10:40:19 UTC
[arrow] branch master updated: ARROW-10709: [C++][Python] Allow
PyReadableFile::Read() to call pyobj.read_buffer()
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 03a89aa ARROW-10709: [C++][Python] Allow PyReadableFile::Read() to call pyobj.read_buffer()
03a89aa is described below
commit 03a89aa7c05f5d8707c313eadd4dbe2f8a46136a
Author: Maarten A. Breddels <ma...@gmail.com>
AuthorDate: Tue Dec 1 11:39:26 2020 +0100
ARROW-10709: [C++][Python] Allow PyReadableFile::Read() to call pyobj.read_buffer()
This makes it possible to create a Python-Land-based file object with great performance.
Related https://github.com/vaexio/vaex/pull/1075
Closes #8755 from maartenbreddels/ARROW-10709
Authored-by: Maarten A. Breddels <ma...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/python/io.cc | 50 ++++++++++++++++++++++++++++++++---------
python/pyarrow/tests/test_io.py | 28 +++++++++++++++++++++++
2 files changed, 67 insertions(+), 11 deletions(-)
diff --git a/cpp/src/arrow/python/io.cc b/cpp/src/arrow/python/io.cc
index 515ea13..73525fe 100644
--- a/cpp/src/arrow/python/io.cc
+++ b/cpp/src/arrow/python/io.cc
@@ -44,7 +44,9 @@ namespace py {
// calling any methods
class PythonFile {
public:
- explicit PythonFile(PyObject* file) : file_(file) { Py_INCREF(file); }
+ explicit PythonFile(PyObject* file) : file_(file), checked_read_buffer_(false) {
+ Py_INCREF(file);
+ }
Status CheckClosed() const {
if (!file_) {
@@ -108,6 +110,14 @@ class PythonFile {
return Status::OK();
}
+ Status ReadBuffer(int64_t nbytes, PyObject** out) {
+ PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(n)",
+ static_cast<Py_ssize_t>(nbytes));
+ PY_RETURN_IF_ERROR(StatusCode::IOError);
+ *out = result;
+ return Status::OK();
+ }
+
Status Write(const void* data, int64_t nbytes) {
RETURN_NOT_OK(CheckClosed());
@@ -152,9 +162,19 @@ class PythonFile {
std::mutex& lock() { return lock_; }
+ bool HasReadBuffer() {
+ if (!checked_read_buffer_) { // we don't want to check this each time
+ has_read_buffer_ = PyObject_HasAttrString(file_.obj(), "read_buffer") == 1;
+ checked_read_buffer_ = true;
+ }
+ return has_read_buffer_;
+ }
+
private:
std::mutex lock_;
OwnedRefNoGIL file_;
+ bool has_read_buffer_;
+ bool checked_read_buffer_;
};
// ----------------------------------------------------------------------
@@ -199,25 +219,33 @@ Result<int64_t> PyReadableFile::Read(int64_t nbytes, void* out) {
PyObject* bytes_obj = bytes.obj();
DCHECK(bytes_obj != NULL);
- if (!PyBytes_Check(bytes_obj)) {
+ Py_buffer py_buf;
+ if (!PyObject_GetBuffer(bytes_obj, &py_buf, PyBUF_ANY_CONTIGUOUS)) {
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(py_buf.buf);
+ std::memcpy(out, data, py_buf.len);
+ int64_t len = py_buf.len;
+ PyBuffer_Release(&py_buf);
+ return len;
+ } else {
return Status::TypeError(
- "Python file read() should have returned a bytes object, got '",
+ "Python file read() should have returned a bytes object or an object "
+ "supporting the buffer protocol, got '",
Py_TYPE(bytes_obj)->tp_name, "' (did you open the file in binary mode?)");
}
-
- int64_t bytes_read = PyBytes_GET_SIZE(bytes_obj);
- std::memcpy(out, PyBytes_AS_STRING(bytes_obj), bytes_read);
- return bytes_read;
});
}
Result<std::shared_ptr<Buffer>> PyReadableFile::Read(int64_t nbytes) {
return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
- OwnedRef bytes_obj;
- RETURN_NOT_OK(file_->Read(nbytes, bytes_obj.ref()));
- DCHECK(bytes_obj.obj() != NULL);
+ OwnedRef buffer_obj;
+ if (file_->HasReadBuffer()) {
+ RETURN_NOT_OK(file_->ReadBuffer(nbytes, buffer_obj.ref()));
+ } else {
+ RETURN_NOT_OK(file_->Read(nbytes, buffer_obj.ref()));
+ }
+ DCHECK(buffer_obj.obj() != NULL);
- return PyBuffer::FromPyObject(bytes_obj.obj());
+ return PyBuffer::FromPyObject(buffer_obj.obj());
});
}
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 69dc135..d637fc8 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -163,6 +163,34 @@ def test_python_file_readinto():
assert len(dst_buf) == length
+def test_python_file_read_buffer():
+ length = 10
+ data = b'0123456798'
+ dst_buf = bytearray(data)
+
+ class DuckReader:
+ def close(self):
+ pass
+
+ @property
+ def closed(self):
+ return False
+
+ def read_buffer(self, nbytes):
+ assert nbytes == length
+ return memoryview(dst_buf)[:nbytes]
+
+ duck_reader = DuckReader()
+ with pa.PythonFile(duck_reader, mode='r') as f:
+ buf = f.read_buffer(length)
+ assert len(buf) == length
+ assert memoryview(buf).tobytes() == dst_buf[:length]
+ # buf should point to the same memory, so modyfing it
+ memoryview(buf)[0] = ord(b'x')
+ # should modify the original
+ assert dst_buf[0] == ord(b'x')
+
+
def test_python_file_correct_abc():
with pa.PythonFile(BytesIO(b''), mode='r') as f:
assert isinstance(f, BufferedIOBase)