You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/17 13:03:25 UTC
arrow git commit: ARROW-597: [Python] Add read_pandas convenience to
stream and file reader classes. Add some data type docstrings
Repository: arrow
Updated Branches:
refs/heads/master e370174dd -> 5fbfd8e58
ARROW-597: [Python] Add read_pandas convenience to stream and file reader classes. Add some data type docstrings
Author: Wes McKinney <we...@twosigma.com>
Closes #855 from wesm/ARROW-597 and squashes the following commits:
1c9c3e20 [Wes McKinney] Add read_pandas convenience to stream and file reader classes. Add a bunch of missing API docstrings
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/5fbfd8e5
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/5fbfd8e5
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/5fbfd8e5
Branch: refs/heads/master
Commit: 5fbfd8e58392439c754cdf19f6f3b27b28303e0a
Parents: e370174
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Jul 17 09:03:20 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Jul 17 09:03:20 2017 -0400
----------------------------------------------------------------------
python/doc/source/api.rst | 7 --
python/doc/source/ipc.rst | 12 ++
python/doc/source/pandas.rst | 2 +
python/pyarrow/__init__.py | 1 +
python/pyarrow/ipc.pxi | 7 ++
python/pyarrow/ipc.py | 23 +++-
python/pyarrow/scalar.pxi | 4 +-
python/pyarrow/tests/test_ipc.py | 35 ++++--
python/pyarrow/types.pxi | 206 ++++++++++++++++++++++++++--------
9 files changed, 233 insertions(+), 64 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/doc/source/api.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index 400614d..c52d400 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -67,7 +67,6 @@ Scalar Value Types
:toctree: generated/
NA
- NAType
Scalar
ArrayValue
BooleanValue
@@ -210,12 +209,6 @@ Type Classes
:toctree: generated/
DataType
- DecimalType
- DictionaryType
- FixedSizeBinaryType
- Time32Type
- Time64Type
- TimestampType
Field
Schema
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/doc/source/ipc.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/ipc.rst b/python/doc/source/ipc.rst
index f0844cd..dca776b 100644
--- a/python/doc/source/ipc.rst
+++ b/python/doc/source/ipc.rst
@@ -136,3 +136,15 @@ batches in the file, and can read any at random:
reader.num_record_batches
b = reader.get_batch(3)
b.equals(batch)
+
+Reading from Stream and File Format for pandas
+----------------------------------------------
+
+The stream and file reader classes have a special ``read_pandas`` method to
+simplify reading multiple record batches and converting them to a single
+DataFrame output:
+
+.. ipython:: python
+
+ df = pa.open_file(buf).read_pandas()
+ df
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/doc/source/pandas.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/pandas.rst b/python/doc/source/pandas.rst
index d234e78..765d62a 100644
--- a/python/doc/source/pandas.rst
+++ b/python/doc/source/pandas.rst
@@ -15,6 +15,8 @@
.. specific language governing permissions and limitations
.. under the License.
+.. _pandas:
+
Using PyArrow with pandas
=========================
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index f7cddd0..e3d783a 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -37,6 +37,7 @@ from pyarrow.lib import (null, bool_,
float16, float32, float64,
binary, string, decimal,
list_, struct, dictionary, field,
+ DataType, NAType,
Field,
Schema,
schema,
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/ipc.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index d6df30b..31ee578 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -16,6 +16,9 @@
# under the License.
cdef class Message:
+ """
+ Container for an Arrow IPC message with metadata and optional body
+ """
cdef:
unique_ptr[CMessage] message
@@ -100,6 +103,10 @@ body length: {2}""".format(self.type, metadata_len, body_len)
cdef class MessageReader:
+ """
+ Interface for reading Message objects from some source (like an
+ InputStream)
+ """
cdef:
unique_ptr[CMessageReader] reader
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index e8ea3ac..f863128 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -26,7 +26,26 @@ from pyarrow.lib import (Message, MessageReader, # noqa
import pyarrow.lib as lib
-class RecordBatchStreamReader(lib._RecordBatchReader):
+class _ReadPandasOption(object):
+
+ def read_pandas(self, **options):
+ """
+ Read contents of the stream or file and convert to pandas.DataFrame
+ using Table.to_pandas
+
+ Parameters
+ ----------
+ **options : arguments to forward to Table.to_pandas
+
+ Returns
+ -------
+ df : pandas.DataFrame
+ """
+ table = self.read_all()
+ return table.to_pandas(**options)
+
+
+class RecordBatchStreamReader(lib._RecordBatchReader, _ReadPandasOption):
"""
Reader for the Arrow streaming binary format
@@ -54,7 +73,7 @@ class RecordBatchStreamWriter(lib._RecordBatchWriter):
self._open(sink, schema)
-class RecordBatchFileReader(lib._RecordBatchFileReader):
+class RecordBatchFileReader(lib._RecordBatchFileReader, _ReadPandasOption):
"""
Class for reading Arrow record batch data from the Arrow binary file format
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/scalar.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 11ed0ef..dec5341 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -20,7 +20,9 @@ NA = None
cdef class NAType(Scalar):
-
+ """
+ Null (NA) value singleton
+ """
def __cinit__(self):
global NA
if NA is not None:
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index b2b90d4..bcaca6d 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -63,7 +63,7 @@ class MessagingTest(object):
batches.append(batch)
writer.close()
- return batches
+ return frames, batches
class TestFile(MessagingTest, unittest.TestCase):
@@ -78,7 +78,7 @@ class TestFile(MessagingTest, unittest.TestCase):
pa.open_file(buf)
def test_simple_roundtrip(self):
- batches = self.write_batches()
+ _, batches = self.write_batches()
file_contents = pa.BufferReader(self._get_source())
reader = pa.open_file(file_contents)
@@ -92,7 +92,7 @@ class TestFile(MessagingTest, unittest.TestCase):
assert reader.schema.equals(batches[0].schema)
def test_read_all(self):
- batches = self.write_batches()
+ _, batches = self.write_batches()
file_contents = pa.BufferReader(self._get_source())
reader = pa.open_file(file_contents)
@@ -101,6 +101,16 @@ class TestFile(MessagingTest, unittest.TestCase):
expected = pa.Table.from_batches(batches)
assert result.equals(expected)
+ def test_read_pandas(self):
+ frames, _ = self.write_batches()
+
+ file_contents = pa.BufferReader(self._get_source())
+ reader = pa.open_file(file_contents)
+ result = reader.read_pandas()
+
+ expected = pd.concat(frames)
+ assert_frame_equal(result, expected)
+
class TestStream(MessagingTest, unittest.TestCase):
@@ -113,7 +123,7 @@ class TestStream(MessagingTest, unittest.TestCase):
pa.open_stream(buf)
def test_simple_roundtrip(self):
- batches = self.write_batches()
+ _, batches = self.write_batches()
file_contents = pa.BufferReader(self._get_source())
reader = pa.open_stream(file_contents)
@@ -130,7 +140,7 @@ class TestStream(MessagingTest, unittest.TestCase):
reader.get_next_batch()
def test_read_all(self):
- batches = self.write_batches()
+ _, batches = self.write_batches()
file_contents = pa.BufferReader(self._get_source())
reader = pa.open_stream(file_contents)
@@ -142,7 +152,7 @@ class TestStream(MessagingTest, unittest.TestCase):
class TestMessageReader(MessagingTest, unittest.TestCase):
def _get_example_messages(self):
- batches = self.write_batches()
+ _, batches = self.write_batches()
file_contents = self._get_source()
buf_reader = pa.BufferReader(file_contents)
reader = pa.MessageReader.open_stream(buf_reader)
@@ -187,6 +197,15 @@ class TestMessageReader(MessagingTest, unittest.TestCase):
read_batch = pa.read_record_batch(message, batch.schema)
assert read_batch.equals(batch)
+ def test_read_pandas(self):
+ frames, _ = self.write_batches()
+ file_contents = pa.BufferReader(self._get_source())
+ reader = pa.open_stream(file_contents)
+ result = reader.read_pandas()
+
+ expected = pd.concat(frames)
+ assert_frame_equal(result, expected)
+
class TestSocket(MessagingTest, unittest.TestCase):
@@ -249,7 +268,7 @@ class TestSocket(MessagingTest, unittest.TestCase):
def test_simple_roundtrip(self):
self.start_server(do_read_all=False)
- writer_batches = self.write_batches()
+ _, writer_batches = self.write_batches()
reader_schema, reader_batches = self.stop_and_get_result()
assert reader_schema.equals(writer_batches[0].schema)
@@ -259,7 +278,7 @@ class TestSocket(MessagingTest, unittest.TestCase):
def test_read_all(self):
self.start_server(do_read_all=True)
- writer_batches = self.write_batches()
+ _, writer_batches = self.write_batches()
_, result = self.stop_and_get_result()
expected = pa.Table.from_batches(writer_batches)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/types.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0ae25c8..95bfbfb 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -43,7 +43,9 @@ cdef dict _pandas_type_map = {
cdef class DataType:
-
+ """
+ Base type for Apache Arrow data type instances. Wraps C++ arrow::DataType
+ """
def __cinit__(self):
pass
@@ -475,42 +477,72 @@ cdef set PRIMITIVE_TYPES = set([
def null():
+ """
+ Create instance of null type
+ """
return primitive_type(_Type_NA)
def bool_():
+ """
+ Create instance of boolean type
+ """
return primitive_type(_Type_BOOL)
def uint8():
+ """
+ Create instance of unsigned uint8 type
+ """
return primitive_type(_Type_UINT8)
def int8():
+ """
+ Create instance of signed int8 type
+ """
return primitive_type(_Type_INT8)
def uint16():
+ """
+ Create instance of unsigned uint16 type
+ """
return primitive_type(_Type_UINT16)
def int16():
+ """
+ Create instance of signed int16 type
+ """
return primitive_type(_Type_INT16)
def uint32():
+ """
+ Create instance of unsigned uint32 type
+ """
return primitive_type(_Type_UINT32)
def int32():
+ """
+ Create instance of signed int32 type
+ """
return primitive_type(_Type_INT32)
def uint64():
+ """
+ Create instance of unsigned uint64 type
+ """
return primitive_type(_Type_UINT64)
def int64():
+ """
+ Create instance of signed int64 type
+ """
return primitive_type(_Type_INT64)
@@ -529,104 +561,183 @@ cdef timeunit_to_string(TimeUnit unit):
return 'ns'
-def timestamp(unit_str, tz=None):
+def timestamp(unit, tz=None):
+ """
+ Create instance of timestamp type with resolution and optional time zone
+
+ Parameters
+ ----------
+ unit : string
+ one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns'
+ [nanosecond]
+ tz : string, default None
+ Time zone name. None indicates time zone naive
+
+ Examples
+ --------
+ ::
+
+ t1 = pa.timestamp('us')
+ t2 = pa.timestamp('s', tz='America/New_York')
+
+ Returns
+ -------
+ timestamp_type : TimestampType
+ """
cdef:
- TimeUnit unit
+ TimeUnit unit_code
c_string c_timezone
- if unit_str == "s":
- unit = TimeUnit_SECOND
- elif unit_str == 'ms':
- unit = TimeUnit_MILLI
- elif unit_str == 'us':
- unit = TimeUnit_MICRO
- elif unit_str == 'ns':
- unit = TimeUnit_NANO
+ if unit == "s":
+ unit_code = TimeUnit_SECOND
+ elif unit == 'ms':
+ unit_code = TimeUnit_MILLI
+ elif unit == 'us':
+ unit_code = TimeUnit_MICRO
+ elif unit == 'ns':
+ unit_code = TimeUnit_NANO
else:
raise ValueError('Invalid TimeUnit string')
cdef TimestampType out = TimestampType()
if tz is None:
- out.init(ctimestamp(unit))
- if unit in _timestamp_type_cache:
- return _timestamp_type_cache[unit]
- _timestamp_type_cache[unit] = out
+ out.init(ctimestamp(unit_code))
+ if unit_code in _timestamp_type_cache:
+ return _timestamp_type_cache[unit_code]
+ _timestamp_type_cache[unit_code] = out
else:
if not isinstance(tz, six.string_types):
tz = tz.zone
c_timezone = tobytes(tz)
- out.init(ctimestamp(unit, c_timezone))
+ out.init(ctimestamp(unit_code, c_timezone))
return out
-def time32(unit_str):
+def time32(unit):
+ """
+ Create instance of 32-bit time (time of day) type with unit resolution
+
+ Parameters
+ ----------
+ unit : string
+ one of 's' [second], or 'ms' [millisecond]
+
+ Examples
+ --------
+ ::
+
+ t1 = pa.time32('s')
+ t2 = pa.time32('ms')
+ """
cdef:
- TimeUnit unit
+ TimeUnit unit_code
c_string c_timezone
- if unit_str == "s":
- unit = TimeUnit_SECOND
- elif unit_str == 'ms':
- unit = TimeUnit_MILLI
+ if unit == "s":
+ unit_code = TimeUnit_SECOND
+ elif unit == 'ms':
+ unit_code = TimeUnit_MILLI
else:
- raise ValueError('Invalid TimeUnit for time32: {}'.format(unit_str))
+ raise ValueError('Invalid TimeUnit for time32: {}'.format(unit))
cdef Time32Type out
- if unit in _time_type_cache:
- return _time_type_cache[unit]
+ if unit_code in _time_type_cache:
+ return _time_type_cache[unit_code]
else:
out = Time32Type()
- out.init(ctime32(unit))
- _time_type_cache[unit] = out
+ out.init(ctime32(unit_code))
+ _time_type_cache[unit_code] = out
return out
-def time64(unit_str):
+def time64(unit):
+ """
+ Create instance of 64-bit time (time of day) type with unit resolution
+
+ Parameters
+ ----------
+ unit : string
+ one of 'us' [microsecond], or 'ns' [nanosecond]
+
+ Examples
+ --------
+ ::
+
+ t1 = pa.time64('us')
+ t2 = pa.time64('ns')
+ """
cdef:
- TimeUnit unit
+ TimeUnit unit_code
c_string c_timezone
- if unit_str == "us":
- unit = TimeUnit_MICRO
- elif unit_str == 'ns':
- unit = TimeUnit_NANO
+ if unit == "us":
+ unit_code = TimeUnit_MICRO
+ elif unit == 'ns':
+ unit_code = TimeUnit_NANO
else:
- raise ValueError('Invalid TimeUnit for time64: {}'.format(unit_str))
+ raise ValueError('Invalid TimeUnit for time64: {}'.format(unit))
cdef Time64Type out
- if unit in _time_type_cache:
- return _time_type_cache[unit]
+ if unit_code in _time_type_cache:
+ return _time_type_cache[unit_code]
else:
out = Time64Type()
- out.init(ctime64(unit))
- _time_type_cache[unit] = out
+ out.init(ctime64(unit_code))
+ _time_type_cache[unit_code] = out
return out
def date32():
+ """
+ Create instance of 32-bit date (days since UNIX epoch 1970-01-01)
+ """
return primitive_type(_Type_DATE32)
def date64():
+ """
+ Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01)
+ """
return primitive_type(_Type_DATE64)
def float16():
+ """
+ Create half-precision floating point type
+ """
return primitive_type(_Type_HALF_FLOAT)
def float32():
+ """
+ Create single-precision floating point type
+ """
return primitive_type(_Type_FLOAT)
def float64():
+ """
+ Create double-precision floating point type
+ """
return primitive_type(_Type_DOUBLE)
cpdef DataType decimal(int precision, int scale=0):
+ """
+ Create decimal type with precision and scale
+
+ Parameters
+ ----------
+ precision : int
+ scale : int
+
+ Returns
+ -------
+ decimal_type : DecimalType
+ """
cdef shared_ptr[CDataType] decimal_type
decimal_type.reset(new CDecimalType(precision, scale))
return pyarrow_wrap_data_type(decimal_type)
@@ -634,13 +745,14 @@ cpdef DataType decimal(int precision, int scale=0):
def string():
"""
- UTF8 string
+ Create UTF8 variable-length string type
"""
return primitive_type(_Type_STRING)
def binary(int length=-1):
- """Binary (PyBytes-like) type
+ """
+ Create variable-length binary type
Parameters
----------
@@ -717,12 +829,14 @@ def struct(fields):
Examples
--------
- import pyarrow as pa
- fields = [
- pa.field('f1', pa.int32()),
- pa.field('f2', pa.string())
- ]
- struct_type = pa.struct(fields)
+ ::
+
+ import pyarrow as pa
+ fields = [
+ pa.field('f1', pa.int32()),
+ pa.field('f2', pa.string())
+ ]
+ struct_type = pa.struct(fields)
Returns
-------