You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/21 15:51:28 UTC
[arrow] branch master updated: ARROW-3094: [Python] Easier
construction of schemas and struct types
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e8e82d0 ARROW-3094: [Python] Easier construction of schemas and struct types
e8e82d0 is described below
commit e8e82d0e9188d1430f53e02520f3d55b467cae3a
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Aug 21 11:51:23 2018 -0400
ARROW-3094: [Python] Easier construction of schemas and struct types
Allow calling `pa.schema()` and `pa.struct()` with a list of tuples, or a mapping of strings to datatypes, instead of having to call `pa.field()` explicitly for each field. The latter is still possible if e.g. wanting to pass metadata.
Author: Antoine Pitrou <an...@python.org>
Closes #2450 from pitrou/ARROW-3094-easier-struct-schema-construction and squashes the following commits:
4b275417 <Antoine Pitrou> Use shorthand notation more often in docs
12151239 <Antoine Pitrou> Fix for Python 2.7
39a21df7 <Antoine Pitrou> ARROW-3094: Easier construction of schemas and struct types
---
python/doc/source/data.rst | 36 +++++++++++---------
python/pyarrow/tests/test_schema.py | 36 ++++++++++++++++++++
python/pyarrow/tests/test_types.py | 18 ++++++++++
python/pyarrow/types.pxi | 68 ++++++++++++++++++++++++++++++++-----
4 files changed, 133 insertions(+), 25 deletions(-)
diff --git a/python/doc/source/data.rst b/python/doc/source/data.rst
index f54cba1..3260f6d 100644
--- a/python/doc/source/data.rst
+++ b/python/doc/source/data.rst
@@ -18,8 +18,8 @@
.. currentmodule:: pyarrow
.. _data:
-In-Memory Data Model
-====================
+Data Types and In-Memory Data Model
+===================================
Apache Arrow defines columnar array data structures by composing type metadata
with memory buffers, like the ones explained in the documentation on
@@ -107,12 +107,22 @@ A `struct` is a collection of named fields:
pa.field('s0', t1),
pa.field('s1', t2),
pa.field('s2', t4),
- pa.field('s3', t6)
+ pa.field('s3', t6),
]
t7 = pa.struct(fields)
print(t7)
+For convenience, you can pass ``(name, type)`` tuples directly instead of
+:class:`~pyarrow.Field` instances:
+
+.. ipython:: python
+
+ t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)])
+ print(t8)
+ t8 == t7
+
+
See :ref:`Data Types API <api.types>` for a full listing of data type
functions.
@@ -123,19 +133,15 @@ Schemas
The :class:`~pyarrow.Schema` type is similar to the ``struct`` array type; it
defines the column names and types in a record batch or table data
-structure. The ``pyarrow.schema`` factory function makes new Schema objects in
+structure. The :func:`pyarrow.schema` factory function makes new Schema objects in
Python:
.. ipython:: python
- fields = [
- pa.field('s0', t1),
- pa.field('s1', t2),
- pa.field('s2', t4),
- pa.field('s3', t6)
- ]
-
- my_schema = pa.schema(fields)
+ my_schema = pa.schema([('field0', t1),
+ ('field1', t2),
+ ('field2', t4),
+ ('field3', t6)])
my_schema
In some applications, you may not create schemas directly, only using the ones
@@ -233,10 +239,8 @@ sequence of Python dicts or tuples:
.. ipython:: python
- ty = pa.struct([
- pa.field('x', pa.int8()),
- pa.field('y', pa.bool_()),
- ])
+ ty = pa.struct([('x', pa.int8()),
+ ('y', pa.bool_())])
pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty)
pa.array([(3, True), (4, False)], type=ty)
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index fed6787..d358f12 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+from collections import OrderedDict
import pickle
import pytest
@@ -206,6 +207,7 @@ def test_schema():
sch = pa.schema(fields)
assert sch.names == ['foo', 'bar', 'baz']
+ assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
assert len(sch) == 3
assert sch[0].name == 'foo'
@@ -220,6 +222,40 @@ baz: list<item: int8>
child 0, item: int8"""
+def test_schema_from_tuples():
+ fields = [
+ ('foo', pa.int32()),
+ ('bar', pa.string()),
+ ('baz', pa.list_(pa.int8())),
+ ]
+ sch = pa.schema(fields)
+ assert sch.names == ['foo', 'bar', 'baz']
+ assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
+ assert len(sch) == 3
+ assert repr(sch) == """\
+foo: int32
+bar: string
+baz: list<item: int8>
+ child 0, item: int8"""
+
+
+def test_schema_from_mapping():
+ fields = OrderedDict([
+ ('foo', pa.int32()),
+ ('bar', pa.string()),
+ ('baz', pa.list_(pa.int8())),
+ ])
+ sch = pa.schema(fields)
+ assert sch.names == ['foo', 'bar', 'baz']
+ assert sch.types == [pa.int32(), pa.string(), pa.list_(pa.int8())]
+ assert len(sch) == 3
+ assert repr(sch) == """\
+foo: int32
+bar: string
+baz: list<item: int8>
+ child 0, item: int8"""
+
+
def test_field_flatten():
f0 = pa.field('foo', pa.int32()).add_metadata({b'foo': b'bar'})
assert f0.flatten() == [f0]
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index c0f5930..aaf2e36 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+from collections import OrderedDict
import pickle
import pytest
@@ -233,6 +234,23 @@ def test_struct_type():
for a, b in zip(ty, fields):
a == b
+ # Construct from list of tuples
+ ty = pa.struct([('a', pa.int64()),
+ ('a', pa.int32()),
+ ('b', pa.int32())])
+ assert list(ty) == fields
+ for a, b in zip(ty, fields):
+ a == b
+
+ # Construct from mapping
+ fields = [pa.field('a', pa.int64()),
+ pa.field('b', pa.int32())]
+ ty = pa.struct(OrderedDict([('a', pa.int64()),
+ ('b', pa.int32())]))
+ assert list(ty) == fields
+ for a, b in zip(ty, fields):
+ a == b
+
def test_union_type():
def check_fields(ty, fields):
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index c8bf643..818fa74 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
+import collections
import re
# These are imprecise because the type (in pandas 0.x) depends on the presence
@@ -498,6 +499,13 @@ cdef class Schema:
@property
def names(self):
+ """
+ The schema's field names.
+
+ Returns
+ -------
+ list of str
+ """
cdef int i
result = []
for i in range(self.schema.num_fields()):
@@ -506,6 +514,17 @@ cdef class Schema:
return result
@property
+ def types(self):
+ """
+ The schema's field types.
+
+ Returns
+ -------
+ list of DataType
+ """
+ return [field.type for field in self]
+
+ @property
def metadata(self):
cdef shared_ptr[const CKeyValueMetadata] metadata = (
self.schema.metadata())
@@ -1221,7 +1240,7 @@ def struct(fields):
Parameters
----------
- fields : sequence of Field values
+ fields : iterable of Fields or tuples, or mapping of strings to DataTypes
Examples
--------
@@ -1229,8 +1248,14 @@ def struct(fields):
import pyarrow as pa
fields = [
+ ('f1', pa.int32()),
+ ('f2', pa.string()),
+ ]
+ struct_type = pa.struct(fields)
+
+ fields = [
pa.field('f1', pa.int32()),
- pa.field('f2', pa.string())
+ pa.field('f2', pa.string(), nullable=False),
]
struct_type = pa.struct(fields)
@@ -1239,12 +1264,19 @@ def struct(fields):
type : DataType
"""
cdef:
- Field field
+ Field py_field
vector[shared_ptr[CField]] c_fields
cdef shared_ptr[CDataType] struct_type
- for field in fields:
- c_fields.push_back(field.sp_field)
+ if isinstance(fields, collections.Mapping):
+ fields = fields.items()
+
+ for item in fields:
+ if isinstance(item, tuple):
+ py_field = field(*item)
+ else:
+ py_field = item
+ c_fields.push_back(py_field.sp_field)
struct_type.reset(new CStructType(c_fields))
return pyarrow_wrap_data_type(struct_type)
@@ -1368,10 +1400,21 @@ def schema(fields, dict metadata=None):
Parameters
----------
- field : list or iterable
+ fields : iterable of Fields or tuples, or mapping of strings to DataTypes
metadata : dict, default None
Keys and values must be coercible to bytes
+ Examples
+ --------
+ ::
+
+ import pyarrow as pa
+ fields = [
+ ('some_int', pa.int32()),
+ ('some_string', pa.string()),
+ ]
+ schema = pa.schema(fields)
+
Returns
-------
schema : pyarrow.Schema
@@ -1380,11 +1423,18 @@ def schema(fields, dict metadata=None):
shared_ptr[CKeyValueMetadata] c_meta
shared_ptr[CSchema] c_schema
Schema result
- Field field
+ Field py_field
vector[shared_ptr[CField]] c_fields
- for i, field in enumerate(fields):
- c_fields.push_back(field.sp_field)
+ if isinstance(fields, collections.Mapping):
+ fields = fields.items()
+
+ for item in fields:
+ if isinstance(item, tuple):
+ py_field = field(*item)
+ else:
+ py_field = item
+ c_fields.push_back(py_field.sp_field)
if metadata is not None:
convert_metadata(metadata, &c_meta)