You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/04/18 14:37:08 UTC

arrow git commit: ARROW-818: [Python] Expand Sphinx API docs, pyarrow.* namespace. Add factory functions for time32, time64

Repository: arrow
Updated Branches:
  refs/heads/master bb287e203 -> 7f20f6e73


ARROW-818: [Python] Expand Sphinx API docs, pyarrow.* namespace. Add factory functions for time32, time64

Author: Wes McKinney <we...@twosigma.com>

Closes #557 from wesm/ARROW-818 and squashes the following commits:

96ce436 [Wes McKinney] Expand Sphinx API docs, pyarrow.* namespace. Add factory functions for time32, time64


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7f20f6e7
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7f20f6e7
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7f20f6e7

Branch: refs/heads/master
Commit: 7f20f6e738a2e163b0b753416ee4c4ed00998f4b
Parents: bb287e2
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Apr 18 16:37:03 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Tue Apr 18 16:37:03 2017 +0200

----------------------------------------------------------------------
 python/doc/source/api.rst            | 69 +++++++++++++++++++++++-----
 python/pyarrow/__init__.py           | 33 ++++++++++----
 python/pyarrow/_array.pxd            | 10 +++++
 python/pyarrow/_array.pyx            | 74 ++++++++++++++++++++++++++++++-
 python/pyarrow/_io.pyx               |  6 +--
 python/pyarrow/includes/libarrow.pxd |  3 ++
 python/pyarrow/tests/test_io.py      |  4 +-
 python/pyarrow/tests/test_schema.py  | 21 +++++++++
 8 files changed, 195 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/doc/source/api.rst
----------------------------------------------------------------------
diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst
index 92e248b..08a0694 100644
--- a/python/doc/source/api.rst
+++ b/python/doc/source/api.rst
@@ -24,8 +24,8 @@ API Reference
 
 .. _api.functions:
 
-Type Metadata and Schemas
--------------------------
+Type and Schema Factory Functions
+---------------------------------
 
 .. autosummary::
    :toctree: generated/
@@ -43,6 +43,8 @@ Type Metadata and Schemas
    float16
    float32
    float64
+   time32
+   time64
    timestamp
    date32
    date64
@@ -53,10 +55,8 @@ Type Metadata and Schemas
    struct
    dictionary
    field
-   DataType
-   Field
-   Schema
    schema
+   from_numpy_dtype
 
 Scalar Value Types
 ------------------
@@ -68,6 +68,7 @@ Scalar Value Types
    NAType
    Scalar
    ArrayValue
+   BooleanValue
    Int8Value
    Int16Value
    Int32Value
@@ -82,6 +83,11 @@ Scalar Value Types
    BinaryValue
    StringValue
    FixedSizeBinaryValue
+   Date32Value
+   Date64Value
+   TimestampValue
+   DecimalValue
+
 
 Array Types and Constructors
 ----------------------------
@@ -91,21 +97,30 @@ Array Types and Constructors
 
    array
    Array
-   NullArray
-   NumericArray
-   IntegerArray
-   FloatingPointArray
    BooleanArray
+   DictionaryArray
+   FloatingPointArray
+   IntegerArray
    Int8Array
    Int16Array
    Int32Array
    Int64Array
+   NullArray
+   NumericArray
    UInt8Array
    UInt16Array
    UInt32Array
    UInt64Array
-   DictionaryArray
+   BinaryArray
+   FixedSizeBinaryArray
    StringArray
+   Time32Array
+   Time64Array
+   Date32Array
+   Date64Array
+   TimestampArray
+   DecimalArray
+   ListArray
 
 Tables and Record Batches
 -------------------------
@@ -113,9 +128,11 @@ Tables and Record Batches
 .. autosummary::
    :toctree: generated/
 
+   ChunkedArray
    Column
    RecordBatch
    Table
+   get_record_batch_size
 
 Tensor type and Functions
 -------------------------
@@ -141,7 +158,7 @@ Input / Output and Shared Memory
    MemoryMappedFile
    memory_map
    create_memory_map
-   PythonFileInterface
+   PythonFile
 
 Interprocess Communication and Messaging
 ----------------------------------------
@@ -165,3 +182,33 @@ Memory Pools
    jemalloc_memory_pool
    total_allocated_bytes
    set_memory_pool
+
+Type Classes
+------------
+
+.. autosummary::
+   :toctree: generated/
+
+   DataType
+   DecimalType
+   DictionaryType
+   FixedSizeBinaryType
+   Time32Type
+   Time64Type
+   TimestampType
+   Field
+   Schema
+
+.. currentmodule:: pyarrow.parquet
+
+Apache Parquet
+--------------
+
+.. autosummary::
+   :toctree: generated/
+
+   ParquetDataset
+   ParquetFile
+   read_table
+   write_metadata
+   write_table

http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 87f2352..4d8da9f 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -31,12 +31,20 @@ from pyarrow._config import cpu_count, set_cpu_count
 from pyarrow._array import (null, bool_,
                             int8, int16, int32, int64,
                             uint8, uint16, uint32, uint64,
-                            timestamp, date32, date64,
+                            time32, time64, timestamp, date32, date64,
                             float16, float32, float64,
                             binary, string, decimal,
                             list_, struct, dictionary, field,
-                            DataType, FixedSizeBinaryType,
-                            Field, Schema, schema,
+                            DataType,
+                            DecimalType,
+                            DictionaryType,
+                            FixedSizeBinaryType,
+                            TimestampType,
+                            Time32Type,
+                            Time64Type,
+                            Field,
+                            Schema,
+                            schema,
                             Array, Tensor,
                             array,
                             from_numpy_dtype,
@@ -47,25 +55,34 @@ from pyarrow._array import (null, bool_,
                             Int16Array, UInt16Array,
                             Int32Array, UInt32Array,
                             Int64Array, UInt64Array,
-                            ListArray, StringArray,
+                            ListArray,
+                            BinaryArray, StringArray,
+                            FixedSizeBinaryArray,
                             DictionaryArray,
+                            Date32Array, Date64Array,
+                            TimestampArray, Time32Array, Time64Array,
+                            DecimalArray,
                             ArrayValue, Scalar, NA, NAType,
                             BooleanValue,
                             Int8Value, Int16Value, Int32Value, Int64Value,
                             UInt8Value, UInt16Value, UInt32Value, UInt64Value,
                             FloatValue, DoubleValue, ListValue,
-                            BinaryValue, StringValue, FixedSizeBinaryValue)
+                            BinaryValue, StringValue, FixedSizeBinaryValue,
+                            DecimalValue,
+                            Date32Value, Date64Value, TimestampValue)
 
-from pyarrow._io import (HdfsFile, NativeFile, PythonFileInterface,
+from pyarrow._io import (HdfsFile, NativeFile, PythonFile,
                          Buffer, BufferReader, InMemoryOutputStream,
                          OSFile, MemoryMappedFile, memory_map,
                          frombuffer, read_tensor, write_tensor,
                          memory_map, create_memory_map,
-                         get_record_batch_size, get_tensor_size)
+                         get_record_batch_size, get_tensor_size,
+                         have_libhdfs, have_libhdfs3)
 
 from pyarrow._memory import (MemoryPool, total_allocated_bytes,
                              set_memory_pool, default_memory_pool)
-from pyarrow._table import Column, RecordBatch, Table, concat_tables
+from pyarrow._table import (ChunkedArray, Column, RecordBatch, Table,
+                            concat_tables)
 from pyarrow._error import (ArrowException,
                             ArrowKeyError,
                             ArrowInvalid,

http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/_array.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd
index afb0c27..464de31 100644
--- a/python/pyarrow/_array.pxd
+++ b/python/pyarrow/_array.pxd
@@ -42,6 +42,16 @@ cdef class TimestampType(DataType):
         const CTimestampType* ts_type
 
 
+cdef class Time32Type(DataType):
+    cdef:
+        const CTime32Type* time_type
+
+
+cdef class Time64Type(DataType):
+    cdef:
+        const CTime64Type* time_type
+
+
 cdef class FixedSizeBinaryType(DataType):
     cdef:
         const CFixedSizeBinaryType* fixed_size_binary_type

http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/_array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx
index e41380d..1c571ba 100644
--- a/python/pyarrow/_array.pyx
+++ b/python/pyarrow/_array.pyx
@@ -127,6 +127,30 @@ cdef class TimestampType(DataType):
                 return None
 
 
+cdef class Time32Type(DataType):
+
+    cdef void init(self, const shared_ptr[CDataType]& type):
+        DataType.init(self, type)
+        self.time_type = <const CTime32Type*> type.get()
+
+    property unit:
+
+        def __get__(self):
+            return timeunit_to_string(self.time_type.unit())
+
+
+cdef class Time64Type(DataType):
+
+    cdef void init(self, const shared_ptr[CDataType]& type):
+        DataType.init(self, type)
+        self.time_type = <const CTime64Type*> type.get()
+
+    property unit:
+
+        def __get__(self):
+            return timeunit_to_string(self.time_type.unit())
+
+
 cdef class FixedSizeBinaryType(DataType):
 
     cdef void init(self, const shared_ptr[CDataType]& type):
@@ -342,6 +366,7 @@ def int64():
 
 
 cdef dict _timestamp_type_cache = {}
+cdef dict _time_type_cache = {}
 
 
 cdef timeunit_to_string(TimeUnit unit):
@@ -369,7 +394,7 @@ def timestamp(unit_str, tz=None):
     elif unit_str == 'ns':
         unit = TimeUnit_NANO
     else:
-        raise TypeError('Invalid TimeUnit string')
+        raise ValueError('Invalid TimeUnit string')
 
     cdef TimestampType out = TimestampType()
 
@@ -388,6 +413,50 @@ def timestamp(unit_str, tz=None):
     return out
 
 
+def time32(unit_str):
+    cdef:
+        TimeUnit unit
+        c_string c_timezone
+
+    if unit_str == "s":
+        unit = TimeUnit_SECOND
+    elif unit_str == 'ms':
+        unit = TimeUnit_MILLI
+    else:
+        raise ValueError('Invalid TimeUnit for time32: {}'.format(unit_str))
+
+    cdef Time32Type out
+    if unit in _time_type_cache:
+        return _time_type_cache[unit]
+    else:
+        out = Time32Type()
+        out.init(ctime32(unit))
+        _time_type_cache[unit] = out
+        return out
+
+
+def time64(unit_str):
+    cdef:
+        TimeUnit unit
+        c_string c_timezone
+
+    if unit_str == "us":
+        unit = TimeUnit_MICRO
+    elif unit_str == 'ns':
+        unit = TimeUnit_NANO
+    else:
+        raise ValueError('Invalid TimeUnit for time64: {}'.format(unit_str))
+
+    cdef Time64Type out
+    if unit in _time_type_cache:
+        return _time_type_cache[unit]
+    else:
+        out = Time64Type()
+        out.init(ctime64(unit))
+        _time_type_cache[unit] = out
+        return out
+
+
 def date32():
     return primitive_type(_Type_DATE32)
 
@@ -516,6 +585,9 @@ cdef Schema box_schema(const shared_ptr[CSchema]& type):
 
 
 def from_numpy_dtype(object dtype):
+    """
+    Convert NumPy dtype to pyarrow.DataType
+    """
     cdef shared_ptr[CDataType] c_type
     with nogil:
         check_status(pyarrow.NumPyDtypeToArrow(dtype, &c_type))

http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/_io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_io.pyx b/python/pyarrow/_io.pyx
index 09e8233..40c76f8 100644
--- a/python/pyarrow/_io.pyx
+++ b/python/pyarrow/_io.pyx
@@ -307,7 +307,7 @@ cdef class NativeFile:
 # Python file-like objects
 
 
-cdef class PythonFileInterface(NativeFile):
+cdef class PythonFile(NativeFile):
     cdef:
         object handle
 
@@ -600,7 +600,7 @@ cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader):
         source = BufferReader(source)
     elif not isinstance(source, NativeFile) and hasattr(source, 'read'):
         # Optimistically hope this is file-like
-        source = PythonFileInterface(source, mode='r')
+        source = PythonFile(source, mode='r')
 
     if isinstance(source, NativeFile):
         nf = source
@@ -622,7 +622,7 @@ cdef get_writer(object source, shared_ptr[OutputStream]* writer):
         source = OSFile(source, mode='w')
     elif not isinstance(source, NativeFile) and hasattr(source, 'write'):
         # Optimistically hope this is file-like
-        source = PythonFileInterface(source, mode='w')
+        source = PythonFile(source, mode='w')
 
     if isinstance(source, NativeFile):
         nf = source

http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index ea835f6..473a0b9 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -106,6 +106,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CTime64Type" arrow::Time64Type"(CFixedWidthType):
         TimeUnit unit()
 
+    shared_ptr[CDataType] ctime32" arrow::time32"(TimeUnit unit)
+    shared_ptr[CDataType] ctime64" arrow::time64"(TimeUnit unit)
+
     cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType):
         CDictionaryType(const shared_ptr[CDataType]& index_type,
                         const shared_ptr[CArray]& dictionary)

http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/tests/test_io.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index c5d3708..a14898f 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -32,7 +32,7 @@ import pyarrow as pa
 def test_python_file_write():
     buf = BytesIO()
 
-    f = pa.PythonFileInterface(buf)
+    f = pa.PythonFile(buf)
 
     assert f.tell() == 0
 
@@ -56,7 +56,7 @@ def test_python_file_read():
     data = b'some sample data'
 
     buf = BytesIO(data)
-    f = pa.PythonFileInterface(buf, mode='r')
+    f = pa.PythonFile(buf, mode='r')
 
     assert f.size() == len(data)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/tests/test_schema.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index d1107fb..da704f3 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -77,6 +77,27 @@ def test_type_timestamp_with_tz():
     assert t.tz == tz
 
 
+def test_time_types():
+    t1 = pa.time32('s')
+    t2 = pa.time32('ms')
+    t3 = pa.time64('us')
+    t4 = pa.time64('ns')
+
+    assert t1.unit == 's'
+    assert t2.unit == 'ms'
+    assert t3.unit == 'us'
+    assert t4.unit == 'ns'
+
+    assert str(t1) == 'time32[s]'
+    assert str(t4) == 'time64[ns]'
+
+    with pytest.raises(ValueError):
+        pa.time32('us')
+
+    with pytest.raises(ValueError):
+        pa.time64('s')
+
+
 def test_type_from_numpy_dtype_timestamps():
     cases = [
         (np.dtype('datetime64[s]'), pa.timestamp('s')),