Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/13 20:45:26 UTC

arrow git commit: ARROW-478: Consolidate BytesReader and BufferReader to accept PyBytes or Buffer

Repository: arrow
Updated Branches:
  refs/heads/master 876ae8509 -> 47115aa3e


ARROW-478: Consolidate BytesReader and BufferReader to accept PyBytes or Buffer

This is an API simplification. I think `pyarrow::PyBytesReader` can even be removed.
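
For reference, a quick sketch of the consolidated API as it looks after this patch. The method calls (tell/seek/read_buffer, buffer_from_bytes) are the ones exercised in the diff below; the sample bytes are just a placeholder:

    import pyarrow.io as io

    data = b'some sample data'

    # Plain Python bytes are now accepted directly and wrapped zero-copy
    # in a PyBytesBuffer under the hood
    reader = io.BufferReader(data)
    assert reader.tell() == 0
    reader.seek(5)
    chunk = reader.read_buffer(6)   # returns a pyarrow.io.Buffer

    # An existing pyarrow.io.Buffer works the same way
    reader2 = io.BufferReader(io.buffer_from_bytes(data))

    # Anything else (e.g. a unicode string) raises ValueError,
    # as BytesReader did before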

Author: Wes McKinney <we...@twosigma.com>

Closes #285 from wesm/ARROW-478 and squashes the following commits:

cbfc8ad [Wes McKinney] Consolidate BytesReader and BufferReader to accept either PyBytes or Buffer


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/47115aa3
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/47115aa3
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/47115aa3

Branch: refs/heads/master
Commit: 47115aa3e33d85bbc697a7950da33cf18f4c71be
Parents: 876ae85
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Jan 13 15:45:19 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jan 13 15:45:19 2017 -0500

----------------------------------------------------------------------
 python/pyarrow/__init__.py              |  3 +-
 python/pyarrow/includes/libarrow_io.pxd |  1 +
 python/pyarrow/io.pyx                   | 57 +++++++++++++---------------
 python/pyarrow/parquet.py               |  4 +-
 python/pyarrow/tests/test_io.py         |  7 ++--
 python/pyarrow/tests/test_ipc.py        |  2 +-
 6 files changed, 34 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index d25cdd4..a8c3e8e 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -38,8 +38,7 @@ from pyarrow.array import (Array,
 from pyarrow.error import ArrowException
 
 from pyarrow.io import (HdfsClient, HdfsFile, NativeFile, PythonFileInterface,
-                        BytesReader, Buffer, InMemoryOutputStream,
-                        BufferReader)
+                        Buffer, InMemoryOutputStream, BufferReader)
 
 from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType,
                             BooleanValue,

http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/includes/libarrow_io.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd
index 6b141a3..417af7d 100644
--- a/python/pyarrow/includes/libarrow_io.pxd
+++ b/python/pyarrow/includes/libarrow_io.pxd
@@ -162,6 +162,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil:
 cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil:
     cdef cppclass CBufferReader" arrow::io::BufferReader"\
         (ReadableFileInterface):
+        CBufferReader(const shared_ptr[CBuffer]& buffer)
         CBufferReader(const uint8_t* data, int64_t nbytes)
 
     cdef cppclass BufferOutputStream(OutputStream):

http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 1939fe8..0f626f1 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -275,21 +275,6 @@ cdef class OSFile(NativeFile):
         self.wr_file = <shared_ptr[OutputStream]> handle
 
 
-cdef class BytesReader(NativeFile):
-    cdef:
-        object obj
-
-    def __cinit__(self, obj):
-        if not isinstance(obj, bytes):
-            raise ValueError('Must pass bytes object')
-
-        self.obj = obj
-        self.is_readable = 1
-        self.is_writeable = 0
-        self.is_open = True
-
-        self.rd_file.reset(new pyarrow.PyBytesReader(obj))
-
 # ----------------------------------------------------------------------
 # Arrow buffers
 
@@ -330,12 +315,6 @@ cdef class Buffer:
             self.buffer.get().size())
 
 
-cdef wrap_buffer(const shared_ptr[CBuffer]& buffer):
-    cdef Buffer result = Buffer()
-    result.buffer = buffer
-    return result
-
-
 cdef shared_ptr[PoolBuffer] allocate_buffer():
     cdef shared_ptr[PoolBuffer] result
     result.reset(new PoolBuffer(pyarrow.get_memory_pool()))
@@ -356,23 +335,35 @@ cdef class InMemoryOutputStream(NativeFile):
         self.is_open = True
 
     def get_result(self):
-        cdef Buffer result = Buffer()
-
         check_status(self.wr_file.get().Close())
-        result.init(<shared_ptr[CBuffer]> self.buffer)
-
         self.is_open = False
-        return result
+        return wrap_buffer(<shared_ptr[CBuffer]> self.buffer)
 
 
 cdef class BufferReader(NativeFile):
+    """
+    Zero-copy reader from objects convertible to Arrow buffer
+
+    Parameters
+    ----------
+    obj : Python bytes or pyarrow.io.Buffer
+    """
     cdef:
         Buffer buffer
 
-    def __cinit__(self, Buffer buffer):
-        self.buffer = buffer
-        self.rd_file.reset(new CBufferReader(buffer.buffer.get().data(),
-                                             buffer.buffer.get().size()))
+    def __cinit__(self, object obj):
+        cdef shared_ptr[CBuffer] buf
+
+        if isinstance(obj, Buffer):
+            self.buffer = obj
+        elif isinstance(obj, bytes):
+            buf.reset(new pyarrow.PyBytesBuffer(obj))
+            self.buffer = wrap_buffer(buf)
+        else:
+            raise ValueError('Unable to convert value to buffer: {0}'
+                             .format(type(obj)))
+
+        self.rd_file.reset(new CBufferReader(self.buffer.buffer))
         self.is_readable = 1
         self.is_writeable = 0
         self.is_open = True
@@ -382,16 +373,20 @@ def buffer_from_bytes(object obj):
     """
     Construct an Arrow buffer from a Python bytes object
     """
+    cdef shared_ptr[CBuffer] buf
     if not isinstance(obj, bytes):
         raise ValueError('Must pass bytes object')
 
-    cdef shared_ptr[CBuffer] buf
     buf.reset(new pyarrow.PyBytesBuffer(obj))
+    return wrap_buffer(buf)
+
 
+cdef Buffer wrap_buffer(const shared_ptr[CBuffer]& buf):
     cdef Buffer result = Buffer()
     result.init(buf)
     return result
 
+
 cdef get_reader(object source, shared_ptr[ReadableFileInterface]* reader):
     cdef NativeFile nf
 

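A small sketch of how the reworked get_result() and wrap_buffer() fit together; it assumes the pre-existing NativeFile.write method, which is not part of this diff:

    import pyarrow.io as io

    stream = io.InMemoryOutputStream()
    stream.write(b'some sample data')   # assumed: pre-existing write()

    buf = stream.get_result()           # a Buffer, now built via wrap_buffer()
    assert isinstance(buf, io.Buffer)

    # The Buffer feeds straight into the consolidated reader
    reader = io.BufferReader(buf)
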
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 708ae65..2a1ac9d 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -27,7 +27,7 @@ class ParquetFile(object):
     ----------
     source : str or pyarrow.io.NativeFile
         Readable source. For passing Python file objects or byte buffers,
-        see pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader.
+        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
     metadata : ParquetFileMetadata, default None
         Use existing metadata object, rather than reading from file.
     """
@@ -78,7 +78,7 @@ def read_table(source, columns=None):
     ----------
     source: str or pyarrow.io.NativeFile
         Readable source. For passing Python file objects or byte buffers, see
-        pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader.
+        pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
     columns: list
         If not None, only these columns will be read from the file.
 

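The updated docstrings point byte-buffer users at BufferReader instead of the removed BytesReader; for illustration, the intended pattern looks roughly like this (parquet_bytes is a placeholder for Parquet file contents obtained elsewhere):

    import pyarrow.io as io
    import pyarrow.parquet as pq

    # parquet_bytes: a Python bytes object holding a serialized Parquet file
    reader = io.BufferReader(parquet_bytes)
    table = pq.read_table(reader)
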
http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/tests/test_io.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 224f20d..f28d44a 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -23,7 +23,6 @@ import numpy as np
 
 from pyarrow.compat import u, guid
 import pyarrow.io as io
-import pyarrow as pa
 
 # ----------------------------------------------------------------------
 # Python file-like objects
@@ -81,7 +80,7 @@ def test_python_file_read():
 def test_bytes_reader():
     # Like a BytesIO, but zero-copy underneath for C++ consumers
     data = b'some sample data'
-    f = io.BytesReader(data)
+    f = io.BufferReader(data)
 
     assert f.tell() == 0
 
@@ -103,7 +102,7 @@ def test_bytes_reader():
 
 def test_bytes_reader_non_bytes():
     with pytest.raises(ValueError):
-        io.BytesReader(u('some sample data'))
+        io.BufferReader(u('some sample data'))
 
 
 def test_bytes_reader_retains_parent_reference():
@@ -112,7 +111,7 @@ def test_bytes_reader_retains_parent_reference():
     # ARROW-421
     def get_buffer():
         data = b'some sample data' * 1000
-        reader = io.BytesReader(data)
+        reader = io.BufferReader(data)
         reader.seek(5)
         return reader.read_buffer(6)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/47115aa3/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 368a9d4..bbd6c6a 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -63,7 +63,7 @@ class RoundtripTest(object):
         writer.close()
 
         file_contents = self._get_source()
-        reader = ipc.ArrowFileReader(aio.BytesReader(file_contents))
+        reader = ipc.ArrowFileReader(aio.BufferReader(file_contents))
 
         assert reader.num_record_batches == num_batches