You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/10 21:25:00 UTC
arrow git commit: ARROW-1168: [Python] pandas metadata may contain
"mixed" data types
Repository: arrow
Updated Branches:
refs/heads/master bc16e0e04 -> 471a85fd7
ARROW-1168: [Python] pandas metadata may contain "mixed" data types
Author: Phillip Cloud <cp...@gmail.com>
Closes #817 from cpcloud/ARROW-1168 and squashes the following commits:
4b85f3cb [Phillip Cloud] ARROW-1168: [Python] pandas metadata may contain "mixed" data types
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/471a85fd
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/471a85fd
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/471a85fd
Branch: refs/heads/master
Commit: 471a85fd77b25f6be4b7557349d1d32b042f0dd9
Parents: bc16e0e
Author: Phillip Cloud <cp...@gmail.com>
Authored: Mon Jul 10 17:24:55 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Jul 10 17:24:55 2017 -0400
----------------------------------------------------------------------
python/pyarrow/array.pxi | 44 ++++++--
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/lib.pxd | 5 +
python/pyarrow/lib.pyx | 27 +++++
python/pyarrow/pandas_compat.py | 133 ++++++++++++++++++++---
python/pyarrow/public-api.pxi | 2 +
python/pyarrow/table.pxi | 20 ++--
python/pyarrow/tests/test_array.py | 34 ++++++
python/pyarrow/tests/test_convert_pandas.py | 44 +++++++-
9 files changed, 277 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/array.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 9e6ac8d..bf87173 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -19,7 +19,7 @@ from pyarrow.includes.libarrow cimport *
# These are imprecise because the type (in pandas 0.x) depends on the presence
# of nulls
-_pandas_type_map = {
+cdef dict _pandas_type_map = {
_Type_NA: np.float64, # NaNs
_Type_BOOL: np.bool_,
_Type_INT8: np.int8,
@@ -39,9 +39,11 @@ _pandas_type_map = {
_Type_BINARY: np.object_,
_Type_FIXED_SIZE_BINARY: np.object_,
_Type_STRING: np.object_,
- _Type_LIST: np.object_
+ _Type_LIST: np.object_,
+ _Type_DECIMAL: np.object_,
}
+
cdef class DataType:
def __cinit__(self):
@@ -51,6 +53,11 @@ cdef class DataType:
self.sp_type = type
self.type = type.get()
+ property id:
+
+ def __get__(self):
+ return self.type.id()
+
def __str__(self):
if self.type is NULL:
raise TypeError(
@@ -91,6 +98,18 @@ cdef class DictionaryType(DataType):
self.dict_type = <const CDictionaryType*> type.get()
+cdef class ListType(DataType):
+
+ cdef void init(self, const shared_ptr[CDataType]& type):
+ DataType.init(self, type)
+ self.list_type = <const CListType*> type.get()
+
+ property value_type:
+
+ def __get__(self):
+ return pyarrow_wrap_data_type(self.list_type.value_type())
+
+
cdef class TimestampType(DataType):
cdef void init(self, const shared_ptr[CDataType]& type):
@@ -154,6 +173,16 @@ cdef class DecimalType(FixedSizeBinaryType):
DataType.init(self, type)
self.decimal_type = <const CDecimalType*> type.get()
+ property precision:
+
+ def __get__(self):
+ return self.decimal_type.precision()
+
+ property scale:
+
+ def __get__(self):
+ return self.decimal_type.scale()
+
cdef class Field:
"""
@@ -630,7 +659,7 @@ def binary(int length=-1):
return pyarrow_wrap_data_type(fixed_size_binary_type)
-def list_(value_type):
+cpdef ListType list_(value_type):
"""
Create ListType instance from child data type or field
@@ -645,8 +674,8 @@ def list_(value_type):
cdef:
DataType data_type
Field field
-
- cdef shared_ptr[CDataType] list_type
+ shared_ptr[CDataType] list_type
+ ListType out = ListType()
if isinstance(value_type, DataType):
list_type.reset(new CListType((<DataType> value_type).sp_type))
@@ -655,10 +684,11 @@ def list_(value_type):
else:
raise ValueError('List requires DataType or Field')
- return pyarrow_wrap_data_type(list_type)
+ out.init(list_type)
+ return out
-def dictionary(DataType index_type, Array dictionary):
+cpdef DictionaryType dictionary(DataType index_type, Array dictionary):
"""
Dictionary (categorical, or simply encoded) type
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 902f98e..cc46c76 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -161,6 +161,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CListType" arrow::ListType"(CDataType):
CListType(const shared_ptr[CDataType]& value_type)
CListType(const shared_ptr[CField]& field)
+ shared_ptr[CDataType] value_type()
cdef cppclass CStringType" arrow::StringType"(CDataType):
pass
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/lib.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 8fa7fd9..3e1419b 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -52,6 +52,11 @@ cdef class DataType:
cdef void init(self, const shared_ptr[CDataType]& type)
+cdef class ListType(DataType):
+ cdef:
+ const CListType* list_type
+
+
cdef class DictionaryType(DataType):
cdef:
const CDictionaryType* dict_type
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/lib.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index ae311ac..13c1822 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -67,6 +67,33 @@ def set_cpu_count(count):
CPU_COUNT = max(int(count), 1)
+Type_NA = _Type_NA
+Type_BOOL = _Type_BOOL
+Type_UINT8 = _Type_UINT8
+Type_INT8 = _Type_INT8
+Type_UINT16 = _Type_UINT16
+Type_INT16 = _Type_INT16
+Type_UINT32 = _Type_UINT32
+Type_INT32 = _Type_INT32
+Type_UINT64 = _Type_UINT64
+Type_INT64 = _Type_INT64
+Type_HALF_FLOAT = _Type_HALF_FLOAT
+Type_FLOAT = _Type_FLOAT
+Type_DOUBLE = _Type_DOUBLE
+Type_DECIMAL = _Type_DECIMAL
+Type_DATE32 = _Type_DATE32
+Type_DATE64 = _Type_DATE64
+Type_TIMESTAMP = _Type_TIMESTAMP
+Type_TIME32 = _Type_TIME32
+Type_TIME64 = _Type_TIME64
+Type_BINARY = _Type_BINARY
+Type_STRING = _Type_STRING
+Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
+Type_LIST = _Type_LIST
+Type_STRUCT = _Type_STRUCT
+Type_DICTIONARY = _Type_DICTIONARY
+
+
# Exception types
include "error.pxi"
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/pandas_compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 9711b72..a9569b2 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -21,6 +21,7 @@ import pandas as pd
import six
+import pyarrow as pa
from pyarrow.compat import PY2
@@ -38,17 +39,83 @@ def infer_dtype(column):
return pd.lib.infer_dtype(column)
-def get_column_metadata(column, name):
- inferred_dtype = infer_dtype(column)
+_logical_type_map = {}
+
+
+def get_logical_type_map():
+ global _logical_type_map
+
+ if not _logical_type_map:
+ _logical_type_map.update({
+ pa.lib.Type_NA: 'float64', # NaNs
+ pa.lib.Type_BOOL: 'bool',
+ pa.lib.Type_INT8: 'int8',
+ pa.lib.Type_INT16: 'int16',
+ pa.lib.Type_INT32: 'int32',
+ pa.lib.Type_INT64: 'int64',
+ pa.lib.Type_UINT8: 'uint8',
+ pa.lib.Type_UINT16: 'uint16',
+ pa.lib.Type_UINT32: 'uint32',
+ pa.lib.Type_UINT64: 'uint64',
+ pa.lib.Type_HALF_FLOAT: 'float16',
+ pa.lib.Type_FLOAT: 'float32',
+ pa.lib.Type_DOUBLE: 'float64',
+ pa.lib.Type_DATE32: 'date',
+ pa.lib.Type_DATE64: 'date',
+ pa.lib.Type_BINARY: 'bytes',
+ pa.lib.Type_FIXED_SIZE_BINARY: 'bytes',
+ pa.lib.Type_STRING: 'unicode',
+ })
+ return _logical_type_map
+
+
+def get_logical_type(arrow_type):
+ logical_type_map = get_logical_type_map()
+
+ try:
+ return logical_type_map[arrow_type.id]
+ except KeyError:
+ if isinstance(arrow_type, pa.lib.DictionaryType):
+ return 'categorical'
+ elif isinstance(arrow_type, pa.lib.ListType):
+ return 'list[{}]'.format(get_logical_type(arrow_type.value_type))
+ elif isinstance(arrow_type, pa.lib.TimestampType):
+ return 'datetimetz' if arrow_type.tz is not None else 'datetime'
+ elif isinstance(arrow_type, pa.lib.DecimalType):
+ return 'decimal'
+ raise NotImplementedError(str(arrow_type))
+
+
+def get_column_metadata(column, name, arrow_type):
+ """Construct the metadata for a given column
+
+ Parameters
+ ----------
+ column : pandas.Series
+ name : str
+ arrow_type : pyarrow.DataType
+
+ Returns
+ -------
+ dict
+ """
dtype = column.dtype
+ logical_type = get_logical_type(arrow_type)
if hasattr(dtype, 'categories'):
+ assert logical_type == 'categorical'
extra_metadata = {
'num_categories': len(column.cat.categories),
'ordered': column.cat.ordered,
}
elif hasattr(dtype, 'tz'):
+ assert logical_type == 'datetimetz'
extra_metadata = {'timezone': str(dtype.tz)}
+ elif logical_type == 'decimal':
+ extra_metadata = {
+ 'precision': arrow_type.precision,
+ 'scale': arrow_type.scale,
+ }
else:
extra_metadata = None
@@ -61,25 +128,49 @@ def get_column_metadata(column, name):
return {
'name': name,
- 'pandas_type': {
- 'string': 'bytes' if PY2 else 'unicode',
- 'datetime64': (
- 'datetimetz' if hasattr(dtype, 'tz')
- else 'datetime'
- ),
- 'integer': str(dtype),
- 'floating': str(dtype),
- }.get(inferred_dtype, inferred_dtype),
+ 'pandas_type': logical_type,
'numpy_type': str(dtype),
'metadata': extra_metadata,
}
def index_level_name(index, i):
- return index.name or '__index_level_{:d}__'.format(i)
+ """Return the name of an index level or a default name if `index.name` is
+ None.
+ Parameters
+ ----------
+ index : pandas.Index
+ i : int
+
+ Returns
+ -------
+ name : str
+ """
+ if index.name is not None:
+ return index.name
+ else:
+ return '__index_level_{:d}__'.format(i)
-def construct_metadata(df, index_levels, preserve_index):
+
+def construct_metadata(df, index_levels, preserve_index, types):
+ """Returns a dictionary containing enough metadata to reconstruct a pandas
+ DataFrame as an Arrow Table, including index columns.
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ index_levels : List[pd.Index]
+ preserve_index : bool
+ types : List[pyarrow.DataType]
+
+ Returns
+ -------
+ dict
+ """
+ ncolumns = len(df.columns)
+ df_types = types[:ncolumns]
+ index_types = types[ncolumns:ncolumns + len(index_levels)]
return {
b'pandas': json.dumps(
{
@@ -88,14 +179,22 @@ def construct_metadata(df, index_levels, preserve_index):
for i, level in enumerate(index_levels)
] if preserve_index else [],
'columns': [
- get_column_metadata(df[name], name=name)
- for name in df.columns
+ get_column_metadata(
+ df[name],
+ name=name,
+ arrow_type=arrow_type
+ )
+ for name, arrow_type in zip(df.columns, df_types)
] + (
[
get_column_metadata(
- level, name=index_level_name(level, i)
+ level,
+ name=index_level_name(level, i),
+ arrow_type=arrow_type
+ )
+ for i, (level, arrow_type) in enumerate(
+ zip(index_levels, index_types)
)
- for i, level in enumerate(index_levels)
] if preserve_index else []
),
'pandas_version': pd.__version__,
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/public-api.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index 7b55651..637b749 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -37,6 +37,8 @@ cdef public api object pyarrow_wrap_data_type(
if type.get().id() == _Type_DICTIONARY:
out = DictionaryType()
+ elif type.get().id() == _Type_LIST:
+ out = ListType()
elif type.get().id() == _Type_TIMESTAMP:
out = TimestampType()
elif type.get().id() == _Type_FIXED_SIZE_BINARY:
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/table.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index ef83636..01e5306 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -321,7 +321,9 @@ cdef tuple _dataframe_to_arrays(
list names = []
list arrays = []
list index_columns = []
+ list types = []
DataType type = None
+ Array array
dict metadata
Py_ssize_t i
Py_ssize_t n
@@ -336,20 +338,22 @@ cdef tuple _dataframe_to_arrays(
field = schema.field_by_name(name)
type = getattr(field, "type", None)
- arr = arrays.append(
- Array.from_pandas(
- col, type=type, timestamps_to_ms=timestamps_to_ms
- )
+ array = Array.from_pandas(
+ col, type=type, timestamps_to_ms=timestamps_to_ms
)
+ arrays.append(array)
names.append(name)
+ types.append(array.type)
for i, column in enumerate(index_columns):
- arrays.append(
- Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms)
- )
+ array = Array.from_pandas(column, timestamps_to_ms=timestamps_to_ms)
+ arrays.append(array)
names.append(pdcompat.index_level_name(column, i))
+ types.append(array.type)
- metadata = pdcompat.construct_metadata(df, index_columns, preserve_index)
+ metadata = pdcompat.construct_metadata(
+ df, index_columns, preserve_index, types
+ )
return names, arrays, metadata
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index af21741..e0a7416 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -23,6 +23,7 @@ import pandas as pd
import pandas.util.testing as tm
import pyarrow as pa
+from pyarrow.pandas_compat import get_logical_type
import pyarrow.formatting as fmt
@@ -195,3 +196,36 @@ def test_simple_type_construction():
result = pa.lib.TimestampType()
with pytest.raises(TypeError):
str(result)
+
+
+@pytest.mark.parametrize(
+ ('type', 'expected'),
+ [
+ (pa.null(), 'float64'),
+ (pa.bool_(), 'bool'),
+ (pa.int8(), 'int8'),
+ (pa.int16(), 'int16'),
+ (pa.int32(), 'int32'),
+ (pa.int64(), 'int64'),
+ (pa.uint8(), 'uint8'),
+ (pa.uint16(), 'uint16'),
+ (pa.uint32(), 'uint32'),
+ (pa.uint64(), 'uint64'),
+ (pa.float16(), 'float16'),
+ (pa.float32(), 'float32'),
+ (pa.float64(), 'float64'),
+ (pa.date32(), 'date'),
+ (pa.date64(), 'date'),
+ (pa.binary(), 'bytes'),
+ (pa.binary(length=4), 'bytes'),
+ (pa.string(), 'unicode'),
+ (pa.list_(pa.list_(pa.int16())), 'list[list[int16]]'),
+ (pa.decimal(18, 3), 'decimal'),
+ (pa.timestamp('ms'), 'datetime'),
+ (pa.timestamp('us', 'UTC'), 'datetimetz'),
+ pytest.mark.xfail((pa.time32('s'), None), raises=NotImplementedError),
+ pytest.mark.xfail((pa.time64('us'), None), raises=NotImplementedError),
+ ]
+)
+def test_logical_type(type, expected):
+ assert get_logical_type(type) == expected
http://git-wip-us.apache.org/repos/asf/arrow/blob/471a85fd/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index fb69cac..ac4ad82 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -18,10 +18,12 @@
from collections import OrderedDict
-import pytest
import datetime
import unittest
import decimal
+import json
+
+import pytest
import numpy as np
@@ -721,3 +723,43 @@ class TestPandasConversion(unittest.TestCase):
])
self._check_pandas_roundtrip(df, expected_schema=expected_schema)
+
+ def test_metadata_with_mixed_types(self):
+ df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
+ table = pa.Table.from_pandas(df)
+ metadata = table.schema.metadata
+ assert b'mixed' not in metadata[b'pandas']
+
+ js = json.loads(metadata[b'pandas'].decode('utf8'))
+ data_column = js['columns'][0]
+ assert data_column['pandas_type'] == 'bytes'
+ assert data_column['numpy_type'] == 'object'
+
+ def test_list_metadata(self):
+ df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
+ schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
+ table = pa.Table.from_pandas(df, schema=schema)
+ metadata = table.schema.metadata
+ assert b'mixed' not in metadata[b'pandas']
+
+ js = json.loads(metadata[b'pandas'].decode('utf8'))
+ data_column = js['columns'][0]
+ assert data_column['pandas_type'] == 'list[int64]'
+ assert data_column['numpy_type'] == 'object'
+
+ def test_decimal_metadata(self):
+ expected = pd.DataFrame({
+ 'decimals': [
+ decimal.Decimal('394092382910493.12341234678'),
+ -decimal.Decimal('314292388910493.12343437128'),
+ ]
+ })
+ table = pa.Table.from_pandas(expected)
+ metadata = table.schema.metadata
+ assert b'mixed' not in metadata[b'pandas']
+
+ js = json.loads(metadata[b'pandas'].decode('utf8'))
+ data_column = js['columns'][0]
+ assert data_column['pandas_type'] == 'decimal'
+ assert data_column['numpy_type'] == 'object'
+ assert data_column['metadata'] == {'precision': 26, 'scale': 11}