You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/22 23:59:18 UTC
[arrow] branch master updated: ARROW-4098: [Python] Deprecate
open_file/open_stream top level APIs in favor of using ipc namespace
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e179dda ARROW-4098: [Python] Deprecate open_file/open_stream top level APIs in favor of using ipc namespace
e179dda is described below
commit e179dda432e1f67020a0c832a11fc496eec67e7f
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Sun Dec 23 00:59:06 2018 +0100
ARROW-4098: [Python] Deprecate open_file/open_stream top level APIs in favor of using ipc namespace
This means that some user code will have to change (e.g. https://github.com/apache/spark/blob/8edae94fa7ec1a1cc2c69e0924da0da85d4aac83/python/pyspark/serializers.py#L240), but it is the most maintainable option for the long term. We should not remove the deprecated APIs until we are confident that at least our open source downstream dependents are taken care of.
Author: Krisztián Szűcs <sz...@gmail.com>
Author: Wes McKinney <we...@apache.org>
Closes #3244 from wesm/ARROW-4098 and squashes the following commits:
ec3c54be <Krisztián Szűcs> update ipc doc
9017e7ff <Krisztián Szűcs> remove accidentally committed file
36b6a861 <Wes McKinney> Fix up API docs
7ed5343e <Wes McKinney> Deprecate pyarrow.open_stream/open_file in favor of ipc-namespaced versions
---
docs/source/python/api.rst | 4 ++--
docs/source/python/ipc.rst | 10 +++++-----
python/pyarrow/__init__.py | 22 ++++++++++++++++++++++
python/pyarrow/tests/test_ipc.py | 40 ++++++++++++++++++++--------------------
4 files changed, 49 insertions(+), 27 deletions(-)
diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst
index 40ccb68..0bad76f 100644
--- a/docs/source/python/api.rst
+++ b/docs/source/python/api.rst
@@ -259,14 +259,14 @@ Serialization and IPC
.. autosummary::
:toctree: generated/
+ ipc.open_file
+ ipc.open_stream
Message
MessageReader
RecordBatchFileReader
RecordBatchFileWriter
RecordBatchStreamReader
RecordBatchStreamWriter
- open_file
- open_stream
read_message
read_record_batch
get_record_batch_size
diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst
index 3f7e787..812d843 100644
--- a/docs/source/python/ipc.rst
+++ b/docs/source/python/ipc.rst
@@ -84,11 +84,11 @@ particular stream. Now we can do:
Now ``buf`` contains the complete stream as an in-memory byte buffer. We can
read such a stream with :class:`~pyarrow.RecordBatchStreamReader` or the
-convenience function ``pyarrow.open_stream``:
+convenience function ``pyarrow.ipc.open_stream``:
.. ipython:: python
- reader = pa.open_stream(buf)
+ reader = pa.ipc.open_stream(buf)
reader.schema
batches = [b for b in reader]
@@ -125,11 +125,11 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as
The difference between :class:`~pyarrow.RecordBatchFileReader` and
:class:`~pyarrow.RecordBatchStreamReader` is that the input source must have a
``seek`` method for random access. The stream reader only requires read
-operations. We can also use the ``pyarrow.open_file`` method to open a file:
+operations. We can also use the ``pyarrow.ipc.open_file`` method to open a file:
.. ipython:: python
- reader = pa.open_file(buf)
+ reader = pa.ipc.open_file(buf)
Because we have access to the entire payload, we know the number of record
batches in the file, and can read any at random:
@@ -149,7 +149,7 @@ DataFrame output:
.. ipython:: python
- df = pa.open_file(buf).read_pandas()
+ df = pa.ipc.open_file(buf).read_pandas()
df[:5]
Arbitrary Object Serialization
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 7f0a371..0d1c1be 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -146,6 +146,28 @@ from pyarrow.ipc import (Message, MessageReader,
open_stream,
open_file,
serialize_pandas, deserialize_pandas)
+import pyarrow.ipc as ipc
+
+
+def open_stream(source):
+ """
+ pyarrow.open_stream deprecated since 0.12, use pyarrow.ipc.open_stream
+ """
+ import warnings
+ warnings.warn("pyarrow.open_stream is deprecated, please use "
+ "pyarrow.ipc.open_stream")
+ return ipc.open_stream(source)
+
+
+def open_file(source):
+ """
+ pyarrow.open_file deprecated since 0.12, use pyarrow.ipc.open_file
+ """
+ import warnings
+ warnings.warn("pyarrow.open_file is deprecated, please use "
+ "pyarrow.ipc.open_file")
+ return ipc.open_file(source)
+
localfs = LocalFileSystem.get_instance()
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 0fb66f8..67a91b9 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -80,7 +80,7 @@ class FileFormatFixture(IpcFixture):
_, batches = self.write_batches(as_table=as_table)
file_contents = pa.BufferReader(self.get_source())
- reader = pa.open_file(file_contents)
+ reader = pa.ipc.open_file(file_contents)
assert reader.num_record_batches == len(batches)
@@ -121,7 +121,7 @@ def stream_fixture():
def test_empty_file():
buf = b''
with pytest.raises(pa.ArrowInvalid):
- pa.open_file(pa.BufferReader(buf))
+ pa.ipc.open_file(pa.BufferReader(buf))
def test_file_simple_roundtrip(file_fixture):
@@ -142,7 +142,7 @@ def test_file_read_all(sink_factory):
_, batches = fixture.write_batches()
file_contents = pa.BufferReader(fixture.get_source())
- reader = pa.open_file(file_contents)
+ reader = pa.ipc.open_file(file_contents)
result = reader.read_all()
expected = pa.Table.from_batches(batches)
@@ -154,8 +154,8 @@ def test_open_file_from_buffer(file_fixture):
_, batches = file_fixture.write_batches()
source = file_fixture.get_source()
- reader1 = pa.open_file(source)
- reader2 = pa.open_file(pa.BufferReader(source))
+ reader1 = pa.ipc.open_file(source)
+ reader2 = pa.ipc.open_file(pa.BufferReader(source))
reader3 = pa.RecordBatchFileReader(source)
result1 = reader1.read_all()
@@ -170,7 +170,7 @@ def test_file_read_pandas(file_fixture):
frames, _ = file_fixture.write_batches()
file_contents = pa.BufferReader(file_fixture.get_source())
- reader = pa.open_file(file_contents)
+ reader = pa.ipc.open_file(file_contents)
result = reader.read_pandas()
expected = pd.concat(frames)
@@ -189,8 +189,8 @@ def test_file_pathlib(file_fixture, tmpdir):
with open(path, 'wb') as f:
f.write(source)
- t1 = pa.open_file(pathlib.Path(path)).read_all()
- t2 = pa.open_file(pa.OSFile(path)).read_all()
+ t1 = pa.ipc.open_file(pathlib.Path(path)).read_all()
+ t2 = pa.ipc.open_file(pa.OSFile(path)).read_all()
assert t1.equals(t2)
@@ -198,7 +198,7 @@ def test_file_pathlib(file_fixture, tmpdir):
def test_empty_stream():
buf = io.BytesIO(b'')
with pytest.raises(pa.ArrowInvalid):
- pa.open_stream(buf)
+ pa.ipc.open_stream(buf)
def test_stream_categorical_roundtrip(stream_fixture):
@@ -213,7 +213,7 @@ def test_stream_categorical_roundtrip(stream_fixture):
writer.write_batch(pa.RecordBatch.from_pandas(df))
writer.close()
- table = (pa.open_stream(pa.BufferReader(stream_fixture.get_source()))
+ table = (pa.ipc.open_stream(pa.BufferReader(stream_fixture.get_source()))
.read_all())
assert_frame_equal(table.to_pandas(), df)
@@ -223,8 +223,8 @@ def test_open_stream_from_buffer(stream_fixture):
_, batches = stream_fixture.write_batches()
source = stream_fixture.get_source()
- reader1 = pa.open_stream(source)
- reader2 = pa.open_stream(pa.BufferReader(source))
+ reader1 = pa.ipc.open_stream(source)
+ reader2 = pa.ipc.open_stream(pa.BufferReader(source))
reader3 = pa.RecordBatchStreamReader(source)
result1 = reader1.read_all()
@@ -250,7 +250,7 @@ def test_stream_write_dispatch(stream_fixture):
writer.write(batch)
writer.close()
- table = (pa.open_stream(pa.BufferReader(stream_fixture.get_source()))
+ table = (pa.ipc.open_stream(pa.BufferReader(stream_fixture.get_source()))
.read_all())
assert_frame_equal(table.to_pandas(),
pd.concat([df, df], ignore_index=True))
@@ -271,7 +271,7 @@ def test_stream_write_table_batches(stream_fixture):
writer.write_table(table, chunksize=15)
writer.close()
- batches = list(pa.open_stream(stream_fixture.get_source()))
+ batches = list(pa.ipc.open_stream(stream_fixture.get_source()))
assert list(map(len, batches)) == [10, 15, 5, 10]
result_table = pa.Table.from_batches(batches)
@@ -283,7 +283,7 @@ def test_stream_write_table_batches(stream_fixture):
def test_stream_simple_roundtrip(stream_fixture):
_, batches = stream_fixture.write_batches()
file_contents = pa.BufferReader(stream_fixture.get_source())
- reader = pa.open_stream(file_contents)
+ reader = pa.ipc.open_stream(file_contents)
assert reader.schema.equals(batches[0].schema)
@@ -301,7 +301,7 @@ def test_stream_simple_roundtrip(stream_fixture):
def test_stream_read_all(stream_fixture):
_, batches = stream_fixture.write_batches()
file_contents = pa.BufferReader(stream_fixture.get_source())
- reader = pa.open_stream(file_contents)
+ reader = pa.ipc.open_stream(file_contents)
result = reader.read_all()
expected = pa.Table.from_batches(batches)
@@ -311,7 +311,7 @@ def test_stream_read_all(stream_fixture):
def test_stream_read_pandas(stream_fixture):
frames, _ = stream_fixture.write_batches()
file_contents = stream_fixture.get_source()
- reader = pa.open_stream(file_contents)
+ reader = pa.ipc.open_stream(file_contents)
result = reader.read_pandas()
expected = pd.concat(frames)
@@ -393,7 +393,7 @@ class StreamReaderServer(threading.Thread):
connection, client_address = self._sock.accept()
try:
source = connection.makefile(mode='rb')
- reader = pa.open_stream(source)
+ reader = pa.ipc.open_stream(source)
self._schema = reader.schema
if self._do_read_all:
self._table = reader.read_all()
@@ -494,7 +494,7 @@ def test_ipc_stream_no_batches():
writer.close()
source = sink.getvalue()
- reader = pa.open_stream(source)
+ reader = pa.ipc.open_stream(source)
result = reader.read_all()
assert result.schema.equals(table.schema)
@@ -636,7 +636,7 @@ def write_file(batch, sink):
def read_file(source):
- reader = pa.open_file(source)
+ reader = pa.ipc.open_file(source)
return [reader.get_batch(i)
for i in range(reader.num_record_batches)]