You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/13 20:45:26 UTC
arrow git commit: ARROW-478: Consolidate BytesReader and BufferReader to accept PyBytes or Buffer
Repository: arrow
Updated Branches:
refs/heads/master 876ae8509 -> 47115aa3e
ARROW-478: Consolidate BytesReader and BufferReader to accept PyBytes or Buffer
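API simplification: `BufferReader` now accepts either a Python `bytes` object or a `Buffer`, so the separate `BytesReader` class is removed. I think `pyarrow::PyBytesReader` can even be removed.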
Author: Wes McKinney <we...@twosigma.com>
Closes #285 from wesm/ARROW-478 and squashes the following commits:
cbfc8ad [Wes McKinney] Consolidate BytesReader and BufferReader to accept either PyBytes or Buffer
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/47115aa3
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/47115aa3
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/47115aa3
Branch: refs/heads/master
Commit: 47115aa3e33d85bbc697a7950da33cf18f4c71be
Parents: 876ae85
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Jan 13 15:45:19 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jan 13 15:45:19 2017 -0500
----------------------------------------------------------------------
python/pyarrow/__init__.py | 3 +-
python/pyarrow/includes/libarrow_io.pxd | 1 +
python/pyarrow/io.pyx | 57 +++++++++++++---------------
python/pyarrow/parquet.py | 4 +-
python/pyarrow/tests/test_io.py | 7 ++--
python/pyarrow/tests/test_ipc.py | 2 +-
6 files changed, 34 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index d25cdd4..a8c3e8e 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -38,8 +38,7 @@ from pyarrow.array import (Array,
from pyarrow.error import ArrowException
from pyarrow.io import (HdfsClient, HdfsFile, NativeFile, PythonFileInterface,
- BytesReader, Buffer, InMemoryOutputStream,
- BufferReader)
+ Buffer, InMemoryOutputStream, BufferReader)
from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
BooleanValue,
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/includes/libarrow_io.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd
index 6b141a3..417af7d 100644
--- a/python/pyarrow/includes/libarrow_io.pxd
+++ b/python/pyarrow/includes/libarrow_io.pxd
@@ -162,6 +162,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil:
cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil:
cdef cppclass CBufferReader" arrow::io::BufferReader"\
(ReadableFileInterface):
+ CBufferReader(const shared_ptr[CBuffer]& buffer)
CBufferReader(const uint8_t* data, int64_t nbytes)
cdef cppclass BufferOutputStream(OutputStream):
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 1939fe8..0f626f1 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -275,21 +275,6 @@ cdef class OSFile(NativeFile):
self.wr_file = <shared_ptr[OutputStream]> handle
-cdef class BytesReader(NativeFile):
- cdef:
- object obj
-
- def __cinit__(self, obj):
- if not isinstance(obj, bytes):
- raise ValueError('Must pass bytes object')
-
- self.obj = obj
- self.is_readable = 1
- self.is_writeable = 0
- self.is_open = True
-
- self.rd_file.reset(new pyarrow.PyBytesReader(obj))
-
# ----------------------------------------------------------------------
# Arrow buffers
@@ -330,12 +315,6 @@ cdef class Buffer:
self.buffer.get().size())
-cdef wrap_buffer(const shared_ptr[CBuffer]& buffer):
- cdef Buffer result = Buffer()
- result.buffer = buffer
- return result
-
-
cdef shared_ptr[PoolBuffer] allocate_buffer():
cdef shared_ptr[PoolBuffer] result
result.reset(new PoolBuffer(pyarrow.get_memory_pool()))
@@ -356,23 +335,35 @@ cdef class InMemoryOutputStream(NativeFile):
self.is_open = True
def get_result(self):
- cdef Buffer result = Buffer()
-
check_status(self.wr_file.get().Close())
- result.init(<shared_ptr[CBuffer]> self.buffer)
-
self.is_open = False
- return result
+ return wrap_buffer(<shared_ptr[CBuffer]> self.buffer)
cdef class BufferReader(NativeFile):
+ """
+ Zero-copy reader from objects convertible to Arrow buffer
+
+ Parameters
+ ----------
+ obj : Python bytes or pyarrow.io.Buffer
+ """
cdef:
Buffer buffer
- def __cinit__(self, Buffer buffer):
- self.buffer = buffer
- self.rd_file.reset(new CBufferReader(buffer.buffer.get().data(),
- buffer.buffer.get().size()))
+ def __cinit__(self, object obj):
+ cdef shared_ptr[CBuffer] buf
+
+ if isinstance(obj, Buffer):
+ self.buffer = obj
+ elif isinstance(obj, bytes):
+ buf.reset(new pyarrow.PyBytesBuffer(obj))
+ self.buffer = wrap_buffer(buf)
+ else:
+ raise ValueError('Unable to convert value to buffer: {0}'
+ .format(type(obj)))
+
+ self.rd_file.reset(new CBufferReader(self.buffer.buffer))
self.is_readable = 1
self.is_writeable = 0
self.is_open = True
@@ -382,16 +373,20 @@ def buffer_from_bytes(object obj):
"""
Construct an Arrow buffer from a Python bytes object
"""
+ cdef shared_ptr[CBuffer] buf
if not isinstance(obj, bytes):
raise ValueError('Must pass bytes object')
- cdef shared_ptr[CBuffer] buf
buf.reset(new pyarrow.PyBytesBuffer(obj))
+ return wrap_buffer(buf)
+
+cdef Buffer wrap_buffer(const shared_ptr[CBuffer]& buf):
cdef Buffer result = Buffer()
result.init(buf)
return result
+
cdef get_reader(object source, shared_ptr[ReadableFileInterface]* reader):
cdef NativeFile nf
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 708ae65..2a1ac9d 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -27,7 +27,7 @@ class ParquetFile(object):
----------
source : str or pyarrow.io.NativeFile
Readable source. For passing Python file objects or byte buffers,
- see pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader.
+ see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
metadata : ParquetFileMetadata, default None
Use existing metadata object, rather than reading from file.
"""
@@ -78,7 +78,7 @@ def read_table(source, columns=None):
----------
source: str or pyarrow.io.NativeFile
Readable source. For passing Python file objects or byte buffers, see
- pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader.
+ pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
columns: list
If not None, only these columns will be read from the file.
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/tests/test_io.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 224f20d..f28d44a 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -23,7 +23,6 @@ import numpy as np
from pyarrow.compat import u, guid
import pyarrow.io as io
-import pyarrow as pa
# ----------------------------------------------------------------------
# Python file-like objects
@@ -81,7 +80,7 @@ def test_python_file_read():
def test_bytes_reader():
# Like a BytesIO, but zero-copy underneath for C++ consumers
data = b'some sample data'
- f = io.BytesReader(data)
+ f = io.BufferReader(data)
assert f.tell() == 0
@@ -103,7 +102,7 @@ def test_bytes_reader():
def test_bytes_reader_non_bytes():
with pytest.raises(ValueError):
- io.BytesReader(u('some sample data'))
+ io.BufferReader(u('some sample data'))
def test_bytes_reader_retains_parent_reference():
@@ -112,7 +111,7 @@ def test_bytes_reader_retains_parent_reference():
# ARROW-421
def get_buffer():
data = b'some sample data' * 1000
- reader = io.BytesReader(data)
+ reader = io.BufferReader(data)
reader.seek(5)
return reader.read_buffer(6)
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 368a9d4..bbd6c6a 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -63,7 +63,7 @@ class RoundtripTest(object):
writer.close()
file_contents = self._get_source()
- reader = ipc.ArrowFileReader(aio.BytesReader(file_contents))
+ reader = ipc.ArrowFileReader(aio.BufferReader(file_contents))
assert reader.num_record_batches == num_batches