Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/10 21:25:00 UTC

arrow git commit: ARROW-1168: [Python] pandas metadata may contain "mixed" data types

Repository: arrow
Updated Branches:
  refs/heads/master bc16e0e04 -> 471a85fd7


ARROW-1168: [Python] pandas metadata may contain "mixed" data types

Author: Phillip Cloud <cp...@gmail.com>

Closes #817 from cpcloud/ARROW-1168 and squashes the following commits:

4b85f3cb [Phillip Cloud] ARROW-1168: [Python] pandas metadata may contain "mixed" data types


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/471a85fd
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/471a85fd
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/471a85fd

Branch: refs/heads/master
Commit: 471a85fd77b25f6be4b7557349d1d32b042f0dd9
Parents: bc16e0e
Author: Phillip Cloud <cp...@gmail.com>
Authored: Mon Jul 10 17:24:55 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Jul 10 17:24:55 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/array.pxi                    |  44 ++++++--
 python/pyarrow/includes/libarrow.pxd        |   1 +
 python/pyarrow/lib.pxd                      |   5 +
 python/pyarrow/lib.pyx                      |  27 +++++
 python/pyarrow/pandas_compat.py             | 133 ++++++++++++++++++++---
 python/pyarrow/public-api.pxi               |   2 +
 python/pyarrow/table.pxi                    |  20 ++--
 python/pyarrow/tests/test_array.py          |  34 ++++++
 python/pyarrow/tests/test_convert_pandas.py |  44 +++++++-
 9 files changed, 277 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
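As context for the diff below: Table.from_pandas previously relied on pandas' dtype inference for the schema metadata, which reports "mixed" for object columns containing, e.g., both bytes and unicode values; the metadata now records a logical type derived from the Arrow type of the converted column. A minimal end-to-end sketch (not part of the commit), adapted from the new test_metadata_with_mixed_types test further down:

    import json

    import pandas as pd
    import pyarrow as pa

    # A column mixing bytes and unicode used to be recorded with the
    # pandas-inferred type 'mixed'; the logical type is now taken from
    # the Arrow type of the converted column instead.
    df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
    table = pa.Table.from_pandas(df)
    meta = json.loads(table.schema.metadata[b'pandas'].decode('utf8'))
    print(meta['columns'][0]['pandas_type'])  # 'bytes', not 'mixed'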


http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 9e6ac8d..bf87173 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -19,7 +19,7 @@ from pyarrow.includes.libarrow cimport *
 
 # These are imprecise because the type (in pandas 0.x) depends on the presence
 # of nulls
-_pandas_type_map = {
+cdef dict _pandas_type_map = {
     _Type_NA: np.float64,  # NaNs
     _Type_BOOL: np.bool_,
     _Type_INT8: np.int8,
@@ -39,9 +39,11 @@ _pandas_type_map = {
     _Type_BINARY: np.object_,
     _Type_FIXED_SIZE_BINARY: np.object_,
     _Type_STRING: np.object_,
-    _Type_LIST: np.object_
+    _Type_LIST: np.object_,
+    _Type_DECIMAL: np.object_,
 }
 
+
 cdef class DataType:
 
     def __cinit__(self):
@@ -51,6 +53,11 @@ cdef class DataType:
         self.sp_type = type
         self.type = type.get()
 
+    property id:
+
+        def __get__(self):
+            return self.type.id()
+
     def __str__(self):
         if self.type is NULL:
             raise TypeError(
@@ -91,6 +98,18 @@ cdef class DictionaryType(DataType):
         self.dict_type = <const CDictionaryType*> type.get()
 
 
+cdef class ListType(DataType):
+
+    cdef void init(self, const shared_ptr[CDataType]& type):
+        DataType.init(self, type)
+        self.list_type = <const CListType*> type.get()
+
+    property value_type:
+
+        def __get__(self):
+            return pyarrow_wrap_data_type(self.list_type.value_type())
+
+
 cdef class TimestampType(DataType):
 
     cdef void init(self, const shared_ptr[CDataType]& type):
@@ -154,6 +173,16 @@ cdef class DecimalType(FixedSizeBinaryType):
         DataType.init(self, type)
         self.decimal_type = <const CDecimalType*> type.get()
 
+    property precision:
+
+        def __get__(self):
+            return self.decimal_type.precision()
+
+    property scale:
+
+        def __get__(self):
+            return self.decimal_type.scale()
+
 
 cdef class Field:
     """
@@ -630,7 +659,7 @@ def binary(int length=-1):
     return pyarrow_wrap_data_type(fixed_size_binary_type)
 
 
-def list_(value_type):
+cpdef ListType list_(value_type):
     """
     Create ListType instance from child data type or field
 
@@ -645,8 +674,8 @@ def list_(value_type):
     cdef:
         DataType data_type
         Field field
-
-    cdef shared_ptr[CDataType] list_type
+        shared_ptr[CDataType] list_type
+        ListType out = ListType()
 
     if isinstance(value_type, DataType):
         list_type.reset(new CListType((<DataType> value_type).sp_type))
@@ -655,10 +684,11 @@ def list_(value_type):
     else:
         raise ValueError('List requires DataType or Field')
 
-    return pyarrow_wrap_data_type(list_type)
+    out.init(list_type)
+    return out
 
 
-def dictionary(DataType index_type, Array dictionary):
+cpdef DictionaryType dictionary(DataType index_type, Array dictionary):
     """
     Dictionary (categorical, or simply encoded) type
 

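The new DataType.id, ListType.value_type and DecimalType.precision/scale properties added above are what pandas_compat uses to derive the logical type. A small usage sketch (not part of the commit), assuming the properties behave as declared in this file:

    import pyarrow as pa

    # DataType.id exposes the underlying arrow::Type enum value
    t = pa.list_(pa.int16())
    print(t.id == pa.lib.Type_LIST)  # True
    print(t.value_type)              # int16, via ListType.value_type

    # DecimalType now exposes precision and scale
    d = pa.decimal(18, 3)
    print(d.precision, d.scale)      # 18 3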
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 902f98e..cc46c76 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -161,6 +161,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CListType" arrow::ListType"(CDataType):
         CListType(const shared_ptr[CDataType]& value_type)
         CListType(const shared_ptr[CField]& field)
+        shared_ptr[CDataType] value_type()
 
     cdef cppclass CStringType" arrow::StringType"(CDataType):
         pass

http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/lib.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 8fa7fd9..3e1419b 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -52,6 +52,11 @@ cdef class DataType:
     cdef void init(self, const shared_ptr[CDataType]& type)
 
 
+cdef class ListType(DataType):
+    cdef:
+        const CListType* list_type
+
+
 cdef class DictionaryType(DataType):
     cdef:
         const CDictionaryType* dict_type

http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/lib.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index ae311ac..13c1822 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -67,6 +67,33 @@ def set_cpu_count(count):
     CPU_COUNT = max(int(count), 1)
 
 
+Type_NA = _Type_NA
+Type_BOOL = _Type_BOOL
+Type_UINT8 = _Type_UINT8
+Type_INT8 = _Type_INT8
+Type_UINT16 = _Type_UINT16
+Type_INT16 = _Type_INT16
+Type_UINT32 = _Type_UINT32
+Type_INT32 = _Type_INT32
+Type_UINT64 = _Type_UINT64
+Type_INT64 = _Type_INT64
+Type_HALF_FLOAT = _Type_HALF_FLOAT
+Type_FLOAT = _Type_FLOAT
+Type_DOUBLE = _Type_DOUBLE
+Type_DECIMAL = _Type_DECIMAL
+Type_DATE32 = _Type_DATE32
+Type_DATE64 = _Type_DATE64
+Type_TIMESTAMP = _Type_TIMESTAMP
+Type_TIME32 = _Type_TIME32
+Type_TIME64 = _Type_TIME64
+Type_BINARY = _Type_BINARY
+Type_STRING = _Type_STRING
+Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
+Type_LIST = _Type_LIST
+Type_STRUCT = _Type_STRUCT
+Type_DICTIONARY = _Type_DICTIONARY
+
+
 # Exception types
 include "error.pxi"
 

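These module-level constants mirror the arrow::Type enum values so that pure-Python code such as pandas_compat can key lookup tables by type id rather than by pandas-inferred dtype. A brief sketch of the intended pattern (not part of the commit):

    import pyarrow as pa

    # A lookup keyed by type id, the same pattern used by
    # get_logical_type_map() in pandas_compat.py
    names = {
        pa.lib.Type_INT64: 'int64',
        pa.lib.Type_STRING: 'unicode',
    }
    print(names[pa.int64().id])   # 'int64'
    print(names[pa.string().id])  # 'unicode'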
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/pandas_compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 9711b72..a9569b2 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -21,6 +21,7 @@ import pandas as pd
 
 import six
 
+import pyarrow as pa
 from pyarrow.compat import PY2
 
 
@@ -38,17 +39,83 @@ def infer_dtype(column):
         return pd.lib.infer_dtype(column)
 
 
-def get_column_metadata(column, name):
-    inferred_dtype = infer_dtype(column)
+_logical_type_map = {}
+
+
+def get_logical_type_map():
+    global _logical_type_map
+
+    if not _logical_type_map:
+        _logical_type_map.update({
+            pa.lib.Type_NA: 'float64',  # NaNs
+            pa.lib.Type_BOOL: 'bool',
+            pa.lib.Type_INT8: 'int8',
+            pa.lib.Type_INT16: 'int16',
+            pa.lib.Type_INT32: 'int32',
+            pa.lib.Type_INT64: 'int64',
+            pa.lib.Type_UINT8: 'uint8',
+            pa.lib.Type_UINT16: 'uint16',
+            pa.lib.Type_UINT32: 'uint32',
+            pa.lib.Type_UINT64: 'uint64',
+            pa.lib.Type_HALF_FLOAT: 'float16',
+            pa.lib.Type_FLOAT: 'float32',
+            pa.lib.Type_DOUBLE: 'float64',
+            pa.lib.Type_DATE32: 'date',
+            pa.lib.Type_DATE64: 'date',
+            pa.lib.Type_BINARY: 'bytes',
+            pa.lib.Type_FIXED_SIZE_BINARY: 'bytes',
+            pa.lib.Type_STRING: 'unicode',
+        })
+    return _logical_type_map
+
+
+def get_logical_type(arrow_type):
+    logical_type_map = get_logical_type_map()
+
+    try:
+        return logical_type_map[arrow_type.id]
+    except KeyError:
+        if isinstance(arrow_type, pa.lib.DictionaryType):
+            return 'categorical'
+        elif isinstance(arrow_type, pa.lib.ListType):
+            return 'list[{}]'.format(get_logical_type(arrow_type.value_type))
+        elif isinstance(arrow_type, pa.lib.TimestampType):
+            return 'datetimetz' if arrow_type.tz is not None else 'datetime'
+        elif isinstance(arrow_type, pa.lib.DecimalType):
+            return 'decimal'
+        raise NotImplementedError(str(arrow_type))
+
+
+def get_column_metadata(column, name, arrow_type):
+    """Construct the metadata for a given column
+
+    Parameters
+    ----------
+    column : pandas.Series
+    name : str
+    arrow_type : pyarrow.DataType
+
+    Returns
+    -------
+    dict
+    """
     dtype = column.dtype
+    logical_type = get_logical_type(arrow_type)
 
     if hasattr(dtype, 'categories'):
+        assert logical_type == 'categorical'
         extra_metadata = {
             'num_categories': len(column.cat.categories),
             'ordered': column.cat.ordered,
         }
     elif hasattr(dtype, 'tz'):
+        assert logical_type == 'datetimetz'
         extra_metadata = {'timezone': str(dtype.tz)}
+    elif logical_type == 'decimal':
+        extra_metadata = {
+            'precision': arrow_type.precision,
+            'scale': arrow_type.scale,
+        }
     else:
         extra_metadata = None
 
@@ -61,25 +128,49 @@ def get_column_metadata(column, name):
 
     return {
         'name': name,
-        'pandas_type': {
-            'string': 'bytes' if PY2 else 'unicode',
-            'datetime64': (
-                'datetimetz' if hasattr(dtype, 'tz')
-                else 'datetime'
-            ),
-            'integer': str(dtype),
-            'floating': str(dtype),
-        }.get(inferred_dtype, inferred_dtype),
+        'pandas_type': logical_type,
         'numpy_type': str(dtype),
         'metadata': extra_metadata,
     }
 
 
 def index_level_name(index, i):
-    return index.name or '__index_level_{:d}__'.format(i)
+    """Return the name of an index level or a default name if `index.name` is
+    None.
 
+    Parameters
+    ----------
+    index : pandas.Index
+    i : int
+
+    Returns
+    -------
+    name : str
+    """
+    if index.name is not None:
+        return index.name
+    else:
+        return '__index_level_{:d}__'.format(i)
 
-def construct_metadata(df, index_levels, preserve_index):
+
+def construct_metadata(df, index_levels, preserve_index, types):
+    """Returns a dictionary containing enough metadata to reconstruct a pandas
+    DataFrame as an Arrow Table, including index columns.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    index_levels : List[pd.Index]
+    preserve_index : bool
+    types : List[pyarrow.DataType]
+
+    Returns
+    -------
+    dict
+    """
+    ncolumns = len(df.columns)
+    df_types = types[:ncolumns]
+    index_types = types[ncolumns:ncolumns + len(index_levels)]
     return {
         b'pandas': json.dumps(
             {
@@ -88,14 +179,22 @@ def construct_metadata(df, index_levels, preserve_index):
                     for i, level in enumerate(index_levels)
                 ] if preserve_index else [],
                 'columns': [
-                    get_column_metadata(df[name], name=name)
-                    for name in df.columns
+                    get_column_metadata(
+                        df[name],
+                        name=name,
+                        arrow_type=arrow_type
+                    )
+                    for name, arrow_type in zip(df.columns, df_types)
                 ] + (
                     [
                         get_column_metadata(
-                            level, name=index_level_name(level, i)
+                            level,
+                            name=index_level_name(level, i),
+                            arrow_type=arrow_type
+                        )
+                        for i, (level, arrow_type) in enumerate(
+                            zip(index_levels, index_types)
                         )
-                        for i, level in enumerate(index_levels)
                     ] if preserve_index else []
                 ),
                 'pandas_version': pd.__version__,

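get_logical_type is the piece that replaces pandas dtype inference when writing the 'pandas_type' field of the column metadata. A usage sketch mirroring the new test_logical_type parametrization in test_array.py:

    import pyarrow as pa
    from pyarrow.pandas_compat import get_logical_type

    print(get_logical_type(pa.string()))                # 'unicode'
    print(get_logical_type(pa.list_(pa.int16())))       # 'list[int16]'
    print(get_logical_type(pa.decimal(18, 3)))          # 'decimal'
    print(get_logical_type(pa.timestamp('us', 'UTC')))  # 'datetimetz'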
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/public-api.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 7b55651..637b749 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -37,6 +37,8 @@ cdef public api object pyarrow_wrap_data_type(
 
     if type.get().id() == _Type_DICTIONARY:
         out = DictionaryType()
+    elif type.get().id() == _Type_LIST:
+        out = ListType()
     elif type.get().id() == _Type_TIMESTAMP:
         out = TimestampType()
     elif type.get().id() == _Type_FIXED_SIZE_BINARY:

http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/table.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index ef83636..01e5306 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -321,7 +321,9 @@ cdef tuple _dataframe_to_arrays(
         list names = []
         list arrays = []
         list index_columns = []
+        list types = []
         DataType type = None
+        Array array
         dict metadata
         Py_ssize_t i
         Py_ssize_t n
@@ -336,20 +338,22 @@ cdef tuple _dataframe_to_arrays(
             field = schema.field_by_name(name)
             type = getattr(field, "type", None)
 
-        arr = arrays.append(
-            Array.from_pandas(
-                col, type=type, timestamps_to_ms=timestamps_to_ms
-            )
+        array = Array.from_pandas(
+            col, type=type, timestamps_to_ms=timestamps_to_ms
         )
+        arrays.append(array)
         names.append(name)
+        types.append(array.type)
 
     for i, column in enumerate(index_columns):
-        arrays.append(
-            Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms)
-        )
+        array = Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms)
+        arrays.append(array)
         names.append(pdcompat.index_level_name(column, i))
+        types.append(array.type)
 
-    metadata = pdcompat.construct_metadata(df, index_columns, preserve_index)
+    metadata = pdcompat.construct_metadata(
+        df, index_columns, preserve_index, types
+    )
     return names, arrays, metadata
 
 

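_dataframe_to_arrays now collects the Arrow type of every converted column (data columns first, then index levels) and passes them to construct_metadata, so index metadata is derived from Arrow types as well. A hedged end-to-end sketch, assuming Table.from_pandas preserves the index by default:

    import json

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'a': [1, 2, 3]},
                      index=pd.Index(['x', 'y', 'z'], name='idx'))
    table = pa.Table.from_pandas(df)
    meta = json.loads(table.schema.metadata[b'pandas'].decode('utf8'))
    print(meta['index_columns'])               # ['idx']
    print(meta['columns'][-1]['pandas_type'])  # 'unicode' (the string index)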
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index af21741..e0a7416 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -23,6 +23,7 @@ import pandas as pd
 import pandas.util.testing as tm
 
 import pyarrow as pa
+from pyarrow.pandas_compat import get_logical_type
 import pyarrow.formatting as fmt
 
 
@@ -195,3 +196,36 @@ def test_simple_type_construction():
     result = pa.lib.TimestampType()
     with pytest.raises(TypeError):
         str(result)
+
+
+@pytest.mark.parametrize(
+    ('type', 'expected'),
+    [
+        (pa.null(), 'float64'),
+        (pa.bool_(), 'bool'),
+        (pa.int8(), 'int8'),
+        (pa.int16(), 'int16'),
+        (pa.int32(), 'int32'),
+        (pa.int64(), 'int64'),
+        (pa.uint8(), 'uint8'),
+        (pa.uint16(), 'uint16'),
+        (pa.uint32(), 'uint32'),
+        (pa.uint64(), 'uint64'),
+        (pa.float16(), 'float16'),
+        (pa.float32(), 'float32'),
+        (pa.float64(), 'float64'),
+        (pa.date32(), 'date'),
+        (pa.date64(), 'date'),
+        (pa.binary(), 'bytes'),
+        (pa.binary(length=4), 'bytes'),
+        (pa.string(), 'unicode'),
+        (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
+        (pa.decimal(18, 3), 'decimal'),
+        (pa.timestamp('ms'), 'datetime'),
+        (pa.timestamp('us', 'UTC'), 'datetimetz'),
+        pytest.mark.xfail((pa.time32('s'), None), raises=NotImplementedError),
+        pytest.mark.xfail((pa.time64('us'), None), raises=NotImplementedError),
+   ]
+)
+def test_logical_type(type, expected):
+    assert get_logical_type(type) == expected

http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index fb69cac..ac4ad82 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -18,10 +18,12 @@
 
 from collections import OrderedDict
 
-import pytest
 import datetime
 import unittest
 import decimal
+import json
+
+import pytest
 
 import numpy as np
 
@@ -721,3 +723,43 @@ class TestPandasConversion(unittest.TestCase):
         ])
 
         self._check_pandas_roundtrip(df, expected_schema=expected_schema)
+
+    def test_metadata_with_mixed_types(self):
+        df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
+        table = pa.Table.from_pandas(df)
+        metadata = table.schema.metadata
+        assert b'mixed' not in metadata[b'pandas']
+
+        js = json.loads(metadata[b'pandas'].decode('utf8'))
+        data_column = js['columns'][0]
+        assert data_column['pandas_type'] == 'bytes'
+        assert data_column['numpy_type'] == 'object'
+
+    def test_list_metadata(self):
+        df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
+        schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
+        table = pa.Table.from_pandas(df, schema=schema)
+        metadata = table.schema.metadata
+        assert b'mixed' not in metadata[b'pandas']
+
+        js = json.loads(metadata[b'pandas'].decode('utf8'))
+        data_column = js['columns'][0]
+        assert data_column['pandas_type'] == 'list[int64]'
+        assert data_column['numpy_type'] == 'object'
+
+    def test_decimal_metadata(self):
+        expected = pd.DataFrame({
+            'decimals': [
+                decimal.Decimal('394092382910493.12341234678'),
+                -decimal.Decimal('314292388910493.12343437128'),
+            ]
+        })
+        table = pa.Table.from_pandas(expected)
+        metadata = table.schema.metadata
+        assert b'mixed' not in metadata[b'pandas']
+
+        js = json.loads(metadata[b'pandas'].decode('utf8'))
+        data_column = js['columns'][0]
+        assert data_column['pandas_type'] == 'decimal'
+        assert data_column['numpy_type'] == 'object'
+        assert data_column['metadata'] == {'precision': 26, 'scale': 11}