Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/17 13:03:25 UTC

arrow git commit: ARROW-597: [Python] Add read_pandas convenience to stream and file reader classes. Add some data type docstrings

Repository: arrow
Updated Branches:
  refs/heads/master e370174dd -> 5fbfd8e58


ARROW-597: [Python] Add read_pandas convenience to stream and file reader classes. Add some data type docstrings

Author: Wes McKinney <we...@twosigma.com>

Closes #855 from wesm/ARROW-597 and squashes the following commits:

1c9c3e20 [Wes McKinney] Add read_pandas convenience to stream and file reader classes. Add a bunch of missing API docstrings
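
The convenience collapses the usual read_all() / Table.to_pandas() pair into a
single call on the reader classes. A minimal end-to-end sketch for the file
format (pandas installed; RecordBatch.from_pandas, RecordBatchFileWriter and
write_batch assumed to behave as exercised in the test suite):

    import io

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    batch = pa.RecordBatch.from_pandas(df)

    # Write one record batch in the Arrow file format to an in-memory sink
    sink = io.BytesIO()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()

    # read_pandas() is equivalent to read_all().to_pandas()
    reader = pa.open_file(pa.BufferReader(sink.getvalue()))
    result = reader.read_pandas()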


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/5fbfd8e5
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/5fbfd8e5
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/5fbfd8e5

Branch: refs/heads/master
Commit: 5fbfd8e58392439c754cdf19f6f3b27b28303e0a
Parents: e370174
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Jul 17 09:03:20 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Jul 17 09:03:20 2017 -0400

----------------------------------------------------------------------
 python/doc/source/api.rst        |   7 --
 python/doc/source/ipc.rst        |  12 ++
 python/doc/source/pandas.rst     |   2 +
 python/pyarrow/__init__.py       |   1 +
 python/pyarrow/ipc.pxi           |   7 ++
 python/pyarrow/ipc.py            |  23 +++-
 python/pyarrow/scalar.pxi        |   4 +-
 python/pyarrow/tests/test_ipc.py |  35 ++++--
 python/pyarrow/types.pxi         | 206 ++++++++++++++++++++++++++--------
 9 files changed, 233 insertions(+), 64 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/doc/source/api.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index 400614d..c52d400 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -67,7 +67,6 @@ Scalar Value Types
    :toctree: generated/
 
    NA
-   NAType
    Scalar
    ArrayValue
    BooleanValue
@@ -210,12 +209,6 @@ Type Classes
    :toctree: generated/
 
    DataType
-   DecimalType
-   DictionaryType
-   FixedSizeBinaryType
-   Time32Type
-   Time64Type
-   TimestampType
    Field
    Schema
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/doc/source/ipc.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/ipc.rst b/python/doc/source/ipc.rst
index f0844cd..dca776b 100644
--- a/python/doc/source/ipc.rst
+++ b/python/doc/source/ipc.rst
@@ -136,3 +136,15 @@ batches in the file, and can read any at random:
    reader.num_record_batches
    b = reader.get_batch(3)
    b.equals(batch)
+
+Reading from Stream and File Format for pandas
+----------------------------------------------
+
+The stream and file reader classes have a special ``read_pandas`` method to
+simplify reading multiple record batches and converting them to a single
+DataFrame output:
+
+.. ipython:: python
+
+   df = pa.open_file(buf).read_pandas()
+   df
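
The same convenience applies to the streaming format. A minimal sketch writing
several record batches and reading them back as a single DataFrame (writer and
batch construction assumed to work as in the test suite; the new tests compare
the result against pd.concat of the input frames):

    import io

    import pandas as pd
    import pyarrow as pa

    frames = [pd.DataFrame({'a': [i, i + 1]}) for i in range(3)]
    batches = [pa.RecordBatch.from_pandas(f) for f in frames]

    sink = io.BytesIO()
    writer = pa.RecordBatchStreamWriter(sink, batches[0].schema)
    for batch in batches:
        writer.write_batch(batch)
    writer.close()

    # All remaining record batches are read and converted in one step
    reader = pa.open_stream(pa.BufferReader(sink.getvalue()))
    result = reader.read_pandas()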

http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/doc/source/pandas.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/pandas.rst b/python/doc/source/pandas.rst
index d234e78..765d62a 100644
--- a/python/doc/source/pandas.rst
+++ b/python/doc/source/pandas.rst
@@ -15,6 +15,8 @@
 .. specific language governing permissions and limitations
 .. under the License.
 
+.. _pandas:
+
 Using PyArrow with pandas
 =========================
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index f7cddd0..e3d783a 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -37,6 +37,7 @@ from pyarrow.lib import (null, bool_,
                          float16, float32, float64,
                          binary, string, decimal,
                          list_, struct, dictionary, field,
+                         DataType, NAType,
                          Field,
                          Schema,
                          schema,

http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/ipc.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index d6df30b..31ee578 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -16,6 +16,9 @@
 # under the License.
 
 cdef class Message:
+    """
+    Container for an Arrow IPC message with metadata and optional body
+    """
     cdef:
         unique_ptr[CMessage] message
 
@@ -100,6 +103,10 @@ body length: {2}""".format(self.type, metadata_len, body_len)
 
 
 cdef class MessageReader:
+    """
+    Interface for reading Message objects from some source (like an
+    InputStream)
+    """
     cdef:
         unique_ptr[CMessageReader] reader
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index e8ea3ac..f863128 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -26,7 +26,26 @@ from pyarrow.lib import (Message, MessageReader,  # noqa
 import pyarrow.lib as lib
 
 
-class RecordBatchStreamReader(lib._RecordBatchReader):
+class _ReadPandasOption(object):
+
+    def read_pandas(self, **options):
+        """
+        Read contents of stream and convert to pandas.DataFrame using
+        Table.to_pandas
+
+        Parameters
+        ----------
+        **options : arguments to forward to Table.to_pandas
+
+        Returns
+        -------
+        df : pandas.DataFrame
+        """
+        table = self.read_all()
+        return table.to_pandas(**options)
+
+
+class RecordBatchStreamReader(lib._RecordBatchReader, _ReadPandasOption):
     """
     Reader for the Arrow streaming binary format
 
@@ -54,7 +73,7 @@ class RecordBatchStreamWriter(lib._RecordBatchWriter):
         self._open(sink, schema)
 
 
-class RecordBatchFileReader(lib._RecordBatchFileReader):
+class RecordBatchFileReader(lib._RecordBatchFileReader, _ReadPandasOption):
     """
     Class for reading Arrow record batch data from the Arrow binary file format
 

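The _ReadPandasOption mixin keeps the conversion logic in one place: both
reader classes inherit read_pandas, which is read_all() followed by
Table.to_pandas(**options). A toy, pandas-only sketch of the same composition
pattern (the Fake* classes are hypothetical stand-ins, not pyarrow APIs):

    import pandas as pd

    class _ReadPandasOption(object):
        # same shape as the pyarrow mixin: forward **options to to_pandas
        def read_pandas(self, **options):
            return self.read_all().to_pandas(**options)

    class FakeTable(object):
        # hypothetical stand-in for pyarrow.Table
        def __init__(self, records):
            self._records = records

        def to_pandas(self, **options):
            return pd.DataFrame(self._records)

    class FakeReaderBase(object):
        # hypothetical stand-in for the Cython _RecordBatchReader base
        def read_all(self):
            return FakeTable([{'a': 1}, {'a': 2}])

    class FakeReader(FakeReaderBase, _ReadPandasOption):
        pass

    df = FakeReader().read_pandas()
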
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/scalar.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 11ed0ef..dec5341 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -20,7 +20,9 @@ NA = None
 
 
 cdef class NAType(Scalar):
-
+    """
+    Null (NA) value singleton
+    """
     def __cinit__(self):
         global NA
         if NA is not None:

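Since NA is exposed as a module-level singleton, identity checks against it
are stable; a one-line illustration:

    import pyarrow as pa

    # NA is created once when the module initializes, so identity holds
    assert pa.NA is pa.NA
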
http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index b2b90d4..bcaca6d 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -63,7 +63,7 @@ class MessagingTest(object):
             batches.append(batch)
 
         writer.close()
-        return batches
+        return frames, batches
 
 
 class TestFile(MessagingTest, unittest.TestCase):
@@ -78,7 +78,7 @@ class TestFile(MessagingTest, unittest.TestCase):
             pa.open_file(buf)
 
     def test_simple_roundtrip(self):
-        batches = self.write_batches()
+        _, batches = self.write_batches()
         file_contents = pa.BufferReader(self._get_source())
 
         reader = pa.open_file(file_contents)
@@ -92,7 +92,7 @@ class TestFile(MessagingTest, unittest.TestCase):
             assert reader.schema.equals(batches[0].schema)
 
     def test_read_all(self):
-        batches = self.write_batches()
+        _, batches = self.write_batches()
         file_contents = pa.BufferReader(self._get_source())
 
         reader = pa.open_file(file_contents)
@@ -101,6 +101,16 @@ class TestFile(MessagingTest, unittest.TestCase):
         expected = pa.Table.from_batches(batches)
         assert result.equals(expected)
 
+    def test_read_pandas(self):
+        frames, _ = self.write_batches()
+
+        file_contents = pa.BufferReader(self._get_source())
+        reader = pa.open_file(file_contents)
+        result = reader.read_pandas()
+
+        expected = pd.concat(frames)
+        assert_frame_equal(result, expected)
+
 
 class TestStream(MessagingTest, unittest.TestCase):
 
@@ -113,7 +123,7 @@ class TestStream(MessagingTest, unittest.TestCase):
             pa.open_stream(buf)
 
     def test_simple_roundtrip(self):
-        batches = self.write_batches()
+        _, batches = self.write_batches()
         file_contents = pa.BufferReader(self._get_source())
         reader = pa.open_stream(file_contents)
 
@@ -130,7 +140,7 @@ class TestStream(MessagingTest, unittest.TestCase):
             reader.get_next_batch()
 
     def test_read_all(self):
-        batches = self.write_batches()
+        _, batches = self.write_batches()
         file_contents = pa.BufferReader(self._get_source())
         reader = pa.open_stream(file_contents)
 
@@ -142,7 +152,7 @@ class TestStream(MessagingTest, unittest.TestCase):
 class TestMessageReader(MessagingTest, unittest.TestCase):
 
     def _get_example_messages(self):
-        batches = self.write_batches()
+        _, batches = self.write_batches()
         file_contents = self._get_source()
         buf_reader = pa.BufferReader(file_contents)
         reader = pa.MessageReader.open_stream(buf_reader)
@@ -187,6 +197,15 @@ class TestMessageReader(MessagingTest, unittest.TestCase):
             read_batch = pa.read_record_batch(message, batch.schema)
             assert read_batch.equals(batch)
 
+    def test_read_pandas(self):
+        frames, _ = self.write_batches()
+        file_contents = pa.BufferReader(self._get_source())
+        reader = pa.open_stream(file_contents)
+        result = reader.read_pandas()
+
+        expected = pd.concat(frames)
+        assert_frame_equal(result, expected)
+
 
 class TestSocket(MessagingTest, unittest.TestCase):
 
@@ -249,7 +268,7 @@ class TestSocket(MessagingTest, unittest.TestCase):
 
     def test_simple_roundtrip(self):
         self.start_server(do_read_all=False)
-        writer_batches = self.write_batches()
+        _, writer_batches = self.write_batches()
         reader_schema, reader_batches = self.stop_and_get_result()
 
         assert reader_schema.equals(writer_batches[0].schema)
@@ -259,7 +278,7 @@ class TestSocket(MessagingTest, unittest.TestCase):
 
     def test_read_all(self):
         self.start_server(do_read_all=True)
-        writer_batches = self.write_batches()
+        _, writer_batches = self.write_batches()
         _, result = self.stop_and_get_result()
 
         expected = pa.Table.from_batches(writer_batches)

http://git-wip-us.apache.org/repos/asf/arrow/blob/5fbfd8e5/python/pyarrow/types.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0ae25c8..95bfbfb 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -43,7 +43,9 @@ cdef dict _pandas_type_map = {
 
 
 cdef class DataType:
-
+    """
+    Base type for Apache Arrow data type instances. Wraps C++ arrow::DataType
+    """
     def __cinit__(self):
         pass
 
@@ -475,42 +477,72 @@ cdef set PRIMITIVE_TYPES = set([
 
 
 def null():
+    """
+    Create instance of null type
+    """
     return primitive_type(_Type_NA)
 
 
 def bool_():
+    """
+    Create instance of boolean type
+    """
     return primitive_type(_Type_BOOL)
 
 
 def uint8():
+    """
+    Create instance of unsigned int8 type
+    """
     return primitive_type(_Type_UINT8)
 
 
 def int8():
+    """
+    Create instance of signed int8 type
+    """
     return primitive_type(_Type_INT8)
 
 
 def uint16():
+    """
+    Create instance of unsigned int16 type
+    """
     return primitive_type(_Type_UINT16)
 
 
 def int16():
+    """
+    Create instance of signed int16 type
+    """
     return primitive_type(_Type_INT16)
 
 
 def uint32():
+    """
+    Create instance of unsigned int32 type
+    """
     return primitive_type(_Type_UINT32)
 
 
 def int32():
+    """
+    Create instance of signed int32 type
+    """
     return primitive_type(_Type_INT32)
 
 
 def uint64():
+    """
+    Create instance of unsigned int64 type
+    """
     return primitive_type(_Type_UINT64)
 
 
 def int64():
+    """
+    Create instance of signed int64 type
+    """
     return primitive_type(_Type_INT64)
 
 
@@ -529,104 +561,183 @@ cdef timeunit_to_string(TimeUnit unit):
         return 'ns'
 
 
-def timestamp(unit_str, tz=None):
+def timestamp(unit, tz=None):
+    """
+    Create instance of timestamp type with resolution and optional time zone
+
+    Parameters
+    ----------
+    unit : string
+        one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns'
+        [nanosecond]
+    tz : string, default None
+        Time zone name. None indicates time zone naive
+
+    Examples
+    --------
+    ::
+
+        t1 = pa.timestamp('us')
+        t2 = pa.timestamp('s', tz='America/New_York')
+
+    Returns
+    -------
+    timestamp_type : TimestampType
+    """
     cdef:
-        TimeUnit unit
+        TimeUnit unit_code
         c_string c_timezone
 
-    if unit_str == "s":
-        unit = TimeUnit_SECOND
-    elif unit_str == 'ms':
-        unit = TimeUnit_MILLI
-    elif unit_str == 'us':
-        unit = TimeUnit_MICRO
-    elif unit_str == 'ns':
-        unit = TimeUnit_NANO
+    if unit == "s":
+        unit_code = TimeUnit_SECOND
+    elif unit == 'ms':
+        unit_code = TimeUnit_MILLI
+    elif unit == 'us':
+        unit_code = TimeUnit_MICRO
+    elif unit == 'ns':
+        unit_code = TimeUnit_NANO
     else:
         raise ValueError('Invalid TimeUnit string')
 
     cdef TimestampType out = TimestampType()
 
     if tz is None:
-        out.init(ctimestamp(unit))
-        if unit in _timestamp_type_cache:
-            return _timestamp_type_cache[unit]
-        _timestamp_type_cache[unit] = out
+        out.init(ctimestamp(unit_code))
+        if unit_code in _timestamp_type_cache:
+            return _timestamp_type_cache[unit_code]
+        _timestamp_type_cache[unit_code] = out
     else:
         if not isinstance(tz, six.string_types):
             tz = tz.zone
 
         c_timezone = tobytes(tz)
-        out.init(ctimestamp(unit, c_timezone))
+        out.init(ctimestamp(unit_code, c_timezone))
 
     return out
 
 
-def time32(unit_str):
+def time32(unit):
+    """
+    Create instance of 32-bit time (time of day) type with unit resolution
+
+    Parameters
+    ----------
+    unit : string
+        one of 's' [second], or 'ms' [millisecond]
+
+    Examples
+    --------
+    ::
+
+        t1 = pa.time32('s')
+        t2 = pa.time32('ms')
+    """
     cdef:
-        TimeUnit unit
+        TimeUnit unit_code
         c_string c_timezone
 
-    if unit_str == "s":
-        unit = TimeUnit_SECOND
-    elif unit_str == 'ms':
-        unit = TimeUnit_MILLI
+    if unit == "s":
+        unit_code = TimeUnit_SECOND
+    elif unit == 'ms':
+        unit_code = TimeUnit_MILLI
     else:
-        raise ValueError('Invalid TimeUnit for time32: {}'.format(unit_str))
+        raise ValueError('Invalid TimeUnit for time32: {}'.format(unit))
 
     cdef Time32Type out
-    if unit in _time_type_cache:
-        return _time_type_cache[unit]
+    if unit_code in _time_type_cache:
+        return _time_type_cache[unit_code]
     else:
         out = Time32Type()
-        out.init(ctime32(unit))
-        _time_type_cache[unit] = out
+        out.init(ctime32(unit_code))
+        _time_type_cache[unit_code] = out
         return out
 
 
-def time64(unit_str):
+def time64(unit):
+    """
+    Create instance of 64-bit time (time of day) type with unit resolution
+
+    Parameters
+    ----------
+    unit : string
+        one of 'us' [microsecond], or 'ns' [nanosecond]
+
+    Examples
+    --------
+    ::
+
+        t1 = pa.time64('us')
+        t2 = pa.time64('ns')
+    """
     cdef:
-        TimeUnit unit
+        TimeUnit unit_code
         c_string c_timezone
 
-    if unit_str == "us":
-        unit = TimeUnit_MICRO
-    elif unit_str == 'ns':
-        unit = TimeUnit_NANO
+    if unit == "us":
+        unit_code = TimeUnit_MICRO
+    elif unit == 'ns':
+        unit_code = TimeUnit_NANO
     else:
-        raise ValueError('Invalid TimeUnit for time64: {}'.format(unit_str))
+        raise ValueError('Invalid TimeUnit for time64: {}'.format(unit))
 
     cdef Time64Type out
-    if unit in _time_type_cache:
-        return _time_type_cache[unit]
+    if unit_code in _time_type_cache:
+        return _time_type_cache[unit_code]
     else:
         out = Time64Type()
-        out.init(ctime64(unit))
-        _time_type_cache[unit] = out
+        out.init(ctime64(unit_code))
+        _time_type_cache[unit_code] = out
         return out
 
 
 def date32():
+    """
+    Create instance of 32-bit date (days since UNIX epoch 1970-01-01)
+    """
     return primitive_type(_Type_DATE32)
 
 
 def date64():
+    """
+    Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01)
+    """
     return primitive_type(_Type_DATE64)
 
 
 def float16():
+    """
+    Create half-precision floating point type
+    """
     return primitive_type(_Type_HALF_FLOAT)
 
 
 def float32():
+    """
+    Create single-precision floating point type
+    """
     return primitive_type(_Type_FLOAT)
 
 
 def float64():
+    """
+    Create double-precision floating point type
+    """
     return primitive_type(_Type_DOUBLE)
 
 
 cpdef DataType decimal(int precision, int scale=0):
+    """
+    Create decimal type with precision and scale
+
+    Parameters
+    ----------
+    precision : int
+    scale : int
+
+    Returns
+    -------
+    decimal_type : DecimalType
+    """
     cdef shared_ptr[CDataType] decimal_type
     decimal_type.reset(new CDecimalType(precision, scale))
     return pyarrow_wrap_data_type(decimal_type)
@@ -634,13 +745,14 @@ cpdef DataType decimal(int precision, int scale=0):
 
 def string():
     """
-    UTF8 string
+    Create UTF8 variable-length string type
     """
     return primitive_type(_Type_STRING)
 
 
 def binary(int length=-1):
-    """Binary (PyBytes-like) type
+    """
+    Create variable-length binary type
 
     Parameters
     ----------
@@ -717,12 +829,14 @@ def struct(fields):
 
     Examples
     --------
-    import pyarrow as pa
-    fields = [
-        pa.field('f1', pa.int32()),
-        pa.field('f2', pa.string())
-    ]
-    struct_type = pa.struct(fields)
+    ::
+
+        import pyarrow as pa
+        fields = [
+            pa.field('f1', pa.int32()),
+            pa.field('f2', pa.string())
+        ]
+        struct_type = pa.struct(fields)
 
     Returns
     -------