Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/21 15:51:28 UTC

[arrow] branch master updated: ARROW-3094: [Python] Easier construction of schemas and struct types

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e8e82d0  ARROW-3094: [Python] Easier construction of schemas and struct types
e8e82d0 is described below

commit e8e82d0e9188d1430f53e02520f3d55b467cae3a
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Aug 21 11:51:23 2018 -0400

    ARROW-3094: [Python] Easier construction of schemas and struct types
    
    Allow calling `pa.schema()` and `pa.struct()` with a list of tuples or a mapping of strings to data types, instead of having to call `pa.field()` explicitly for each field. Passing explicit `pa.field()` instances is still possible, e.g. when field metadata needs to be attached.
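    
    For illustration, a minimal sketch of the call forms this change enables
    (mirroring the examples in the patch below; OrderedDict is used so that
    field order is predictable on older Python versions):
    
        import pyarrow as pa
        from collections import OrderedDict
        
        # Shorthand: a list of (name, type) tuples
        schema = pa.schema([('some_int', pa.int32()),
                            ('some_string', pa.string())])
        
        # Also accepted: a mapping of field names to data types
        struct_type = pa.struct(OrderedDict([('x', pa.int8()),
                                             ('y', pa.bool_())]))
        
        # The explicit Field form still works, e.g. to attach metadata
        field_with_meta = pa.field('some_int', pa.int32()).add_metadata({b'key': b'val'})
        schema_with_meta = pa.schema([field_with_meta,
                                      pa.field('some_string', pa.string())])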
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2450 from pitrou/ARROW-3094-easier-struct-schema-construction and squashes the following commits:
    
    4b275417 <Antoine Pitrou> Use shorthand notation more often in docs
    12151239 <Antoine Pitrou> Fix for Python 2.7
    39a21df7 <Antoine Pitrou> ARROW-3094:  Easier construction of schemas and struct types
---
 python/doc/source/data.rst          | 36 +++++++++++---------
 python/pyarrow/tests/test_schema.py | 36 ++++++++++++++++++++
 python/pyarrow/tests/test_types.py  | 18 ++++++++++
 python/pyarrow/types.pxi            | 68 ++++++++++++++++++++++++++++++++-----
 4 files changed, 133 insertions(+), 25 deletions(-)

diff --git a/python/doc/source/data.rst b/python/doc/source/data.rst
index f54cba1..3260f6d 100644
--- a/python/doc/source/data.rst
+++ b/python/doc/source/data.rst
@@ -18,8 +18,8 @@
 .. currentmodule:: pyarrow
 .. _data:
 
-In-Memory Data Model
-====================
+Data Types and In-Memory Data Model
+===================================
 
 Apache Arrow defines columnar array data structures by composing type metadata
 with memory buffers, like the ones explained in the documentation on
@@ -107,12 +107,22 @@ A `struct` is a collection of named fields:
        pa.field('s0', t1),
        pa.field('s1', t2),
        pa.field('s2', t4),
-       pa.field('s3', t6)
+       pa.field('s3', t6),
    ]
 
    t7 = pa.struct(fields)
    print(t7)
 
+For convenience, you can pass ``(name, type)`` tuples directly instead of
+:class:`~pyarrow.Field` instances:
+
+.. ipython:: python
+
+   t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)])
+   print(t8)
+   t8 == t7
+
+
 See :ref:`Data Types API <api.types>` for a full listing of data type
 functions.
 
@@ -123,19 +133,15 @@ Schemas
 
 The :class:`~pyarrow.Schema` type is similar to the ``struct`` array type; it
 defines the column names and types in a record batch or table data
-structure. The ``pyarrow.schema`` factory function makes new Schema objects in
+structure. The :func:`pyarrow.schema` factory function makes new Schema objects in
 Python:
 
 .. ipython:: python
 
-   fields = [
-       pa.field('s0', t1),
-       pa.field('s1', t2),
-       pa.field('s2', t4),
-       pa.field('s3', t6)
-   ]
-
-   my_schema = pa.schema(fields)
+   my_schema = pa.schema([('field0', t1),
+                          ('field1', t2),
+                          ('field2', t4),
+                          ('field3', t6)])
    my_schema
 
 In some applications, you may not create schemas directly, only using the ones
@@ -233,10 +239,8 @@ sequence of Python dicts or tuples:
 
 .. ipython:: python
 
-   ty = pa.struct([
-       pa.field('x', pa.int8()),
-       pa.field('y', pa.bool_()),
-   ])
+   ty = pa.struct([('x', pa.int8()),
+                   ('y', pa.bool_())])
    pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty)
    pa.array([(3, True), (4, False)], type=ty)
 
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index fed6787..d358f12 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from collections import OrderedDict
 import pickle
 
 import pytest
@@ -206,6 +207,7 @@ def test_schema():
     sch = pa.schema(fields)
 
     assert sch.names == ['foo', 'bar', 'baz']
+    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
 
     assert len(sch) == 3
     assert sch[0].name == 'foo'
@@ -220,6 +222,40 @@ baz: list<item: int8>
   child 0, item: int8"""
 
 
+def test_schema_from_tuples():
+    fields = [
+        ('foo', pa.int32()),
+        ('bar', pa.string()),
+        ('baz', pa.list_(pa.int8())),
+    ]
+    sch = pa.schema(fields)
+    assert sch.names == ['foo', 'bar', 'baz']
+    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
+    assert len(sch) == 3
+    assert repr(sch) == """\
+foo: int32
+bar: string
+baz: list<item: int8>
+  child 0, item: int8"""
+
+
+def test_schema_from_mapping():
+    fields = OrderedDict([
+        ('foo', pa.int32()),
+        ('bar', pa.string()),
+        ('baz', pa.list_(pa.int8())),
+    ])
+    sch = pa.schema(fields)
+    assert sch.names == ['foo', 'bar', 'baz']
+    assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
+    assert len(sch) == 3
+    assert repr(sch) == """\
+foo: int32
+bar: string
+baz: list<item: int8>
+  child 0, item: int8"""
+
+
 def test_field_flatten():
     f0 = pa.field('foo', pa.int32()).add_metadata({b'foo': b'bar'})
     assert f0.flatten() == [f0]
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index c0f5930..aaf2e36 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from collections import OrderedDict
 import pickle
 
 import pytest
@@ -233,6 +234,23 @@ def test_struct_type():
     for a, b in zip(ty, fields):
         a == b
 
+    # Construct from list of tuples
+    ty = pa.struct([('a', pa.int64()),
+                    ('a', pa.int32()),
+                    ('b', pa.int32())])
+    assert list(ty) == fields
+    for a, b in zip(ty, fields):
+        a == b
+
+    # Construct from mapping
+    fields = [pa.field('a', pa.int64()),
+              pa.field('b', pa.int32())]
+    ty = pa.struct(OrderedDict([('a', pa.int64()),
+                                ('b', pa.int32())]))
+    assert list(ty) == fields
+    for a, b in zip(ty, fields):
+        a == b
+
 
 def test_union_type():
     def check_fields(ty, fields):
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index c8bf643..818fa74 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import collections
 import re
 
 # These are imprecise because the type (in pandas 0.x) depends on the presence
@@ -498,6 +499,13 @@ cdef class Schema:
 
     @property
     def names(self):
+        """
+        The schema's field names.
+
+        Returns
+        -------
+        list of str
+        """
         cdef int i
         result = []
         for i in range(self.schema.num_fields()):
@@ -506,6 +514,17 @@ cdef class Schema:
         return result
 
     @property
+    def types(self):
+        """
+        The schema's field types.
+
+        Returns
+        -------
+        list of DataType
+        """
+        return [field.type for field in self]
+
+    @property
     def metadata(self):
         cdef shared_ptr[const CKeyValueMetadata] metadata = (
             self.schema.metadata())
@@ -1221,7 +1240,7 @@ def struct(fields):
 
     Parameters
     ----------
-    fields : sequence of Field values
+    fields : iterable of Fields or tuples, or mapping of strings to DataTypes
 
     Examples
     --------
@@ -1229,8 +1248,14 @@ def struct(fields):
 
         import pyarrow as pa
         fields = [
+            ('f1', pa.int32()),
+            ('f2', pa.string()),
+        ]
+        struct_type = pa.struct(fields)
+
+        fields = [
             pa.field('f1', pa.int32()),
-            pa.field('f2', pa.string())
+            pa.field('f2', pa.string(), nullable=False),
         ]
         struct_type = pa.struct(fields)
 
@@ -1239,12 +1264,19 @@ def struct(fields):
     type : DataType
     """
     cdef:
-        Field field
+        Field py_field
         vector[shared_ptr[CField]] c_fields
         cdef shared_ptr[CDataType] struct_type
 
-    for field in fields:
-        c_fields.push_back(field.sp_field)
+    if isinstance(fields, collections.Mapping):
+        fields = fields.items()
+
+    for item in fields:
+        if isinstance(item, tuple):
+            py_field = field(*item)
+        else:
+            py_field = item
+        c_fields.push_back(py_field.sp_field)
 
     struct_type.reset(new CStructType(c_fields))
     return pyarrow_wrap_data_type(struct_type)
@@ -1368,10 +1400,21 @@ def schema(fields, dict metadata=None):
 
     Parameters
     ----------
-    field : list or iterable
+    fields : iterable of Fields or tuples, or mapping of strings to DataTypes
     metadata : dict, default None
         Keys and values must be coercible to bytes
 
+    Examples
+    --------
+    ::
+
+        import pyarrow as pa
+        fields = [
+            ('some_int', pa.int32()),
+            ('some_string', pa.string()),
+        ]
+        schema = pa.schema(fields)
+
     Returns
     -------
     schema : pyarrow.Schema
@@ -1380,11 +1423,18 @@ def schema(fields, dict metadata=None):
         shared_ptr[CKeyValueMetadata] c_meta
         shared_ptr[CSchema] c_schema
         Schema result
-        Field field
+        Field py_field
         vector[shared_ptr[CField]] c_fields
 
-    for i, field in enumerate(fields):
-        c_fields.push_back(field.sp_field)
+    if isinstance(fields, collections.Mapping):
+        fields = fields.items()
+
+    for item in fields:
+        if isinstance(item, tuple):
+            py_field = field(*item)
+        else:
+            py_field = item
+        c_fields.push_back(py_field.sp_field)
 
     if metadata is not None:
         convert_metadata(metadata, &c_meta)