Posted to commits@arrow.apache.org by we...@apache.org on 2018/11/26 14:29:25 UTC
[arrow] branch master updated: ARROW-1993: [Python] Add function for determining implied Arrow schema from pandas.DataFrame
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 54b0af8 ARROW-1993: [Python] Add function for determining implied Arrow schema from pandas.DataFrame
54b0af8 is described below
commit 54b0af837b0ad2f67e97c61a7cdeeca7fd081d0d
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Mon Nov 26 09:29:18 2018 -0500
ARROW-1993: [Python] Add function for determining implied Arrow schema from pandas.DataFrame
Author: Krisztián Szűcs <sz...@gmail.com>
Author: Kee Chong Tan <keechong.tan@>
Closes #1929 from keechongtan/ARROW-1993 and squashes the following commits:
22c357e09 <Krisztián Szűcs> use tmp variable
a9d6a5c7d <Krisztián Szűcs> documentation fixes
3544e42a1 <Krisztián Szűcs> except
d81983475 <Krisztián Szűcs> fix segfault on py2
0cf42b57d <Krisztián Szűcs> fix exception handling
00e86f64c <Krisztián Szűcs> slightly rename functions
7113b6d79 <Krisztián Szűcs> rebase
c7409c6df <Kee Chong Tan> Fix incorrect variable used
d631fb308 <Kee Chong Tan> Add function for determining implied Arrow schema from pandas.DataFrame
b04a09b3d <Kee Chong Tan> Fix incorrect variable used
a5c8b9d0c <Kee Chong Tan> Add function for determining implied Arrow schema from pandas.DataFrame
---
python/doc/source/pandas.rst | 2 ++
python/pyarrow/array.pxi | 23 ++++++++++++++------
python/pyarrow/pandas_compat.py | 43 +++++++++++++++++++++++++++++++++++---
python/pyarrow/tests/test_types.py | 22 +++++++++++++++++--
python/pyarrow/types.pxi | 39 ++++++++++++++++++++++++++++++++++
5 files changed, 118 insertions(+), 11 deletions(-)
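
For context, a minimal usage sketch of the API this commit adds (based on the docstring and the test included in the diff below; exact schema formatting may vary by pyarrow version):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'int': [1, 2], 'str': ['a', 'b']})

# Infer the Arrow schema without converting the data to a Table
schema = pa.Schema.from_pandas(df)

# The inferred schema matches the one produced by a full conversion,
# which is what the new test_schema_from_pandas test asserts
assert schema == pa.Table.from_pandas(df).schema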
diff --git a/python/doc/source/pandas.rst b/python/doc/source/pandas.rst
index be11b5b..6ade171 100644
--- a/python/doc/source/pandas.rst
+++ b/python/doc/source/pandas.rst
@@ -52,6 +52,8 @@ Conversion from a Table to a DataFrame is done by calling
# Convert back to pandas
df_new = table.to_pandas()
+ # Infer Arrow schema from pandas
+ schema = pa.Schema.from_pandas(df)
Series
------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index c3a17a1..b86872f 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -49,12 +49,13 @@ cdef _is_array_like(obj):
return isinstance(obj, np.ndarray)
-cdef _ndarray_to_array(object values, object mask, DataType type,
- c_bool from_pandas, c_bool safe, CMemoryPool* pool):
- cdef:
- shared_ptr[CChunkedArray] chunked_out
- shared_ptr[CDataType] c_type
- CCastOptions cast_options = CCastOptions(safe)
+def _ndarray_to_arrow_type(object values, DataType type):
+ return pyarrow_wrap_data_type(_ndarray_to_type(values, type))
+
+
+cdef shared_ptr[CDataType] _ndarray_to_type(object values,
+ DataType type) except *:
+ cdef shared_ptr[CDataType] c_type
dtype = values.dtype
@@ -65,6 +66,16 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
if type is not None:
c_type = type.sp_type
+ return c_type
+
+
+cdef _ndarray_to_array(object values, object mask, DataType type,
+ c_bool from_pandas, c_bool safe, CMemoryPool* pool):
+ cdef:
+ shared_ptr[CChunkedArray] chunked_out
+ shared_ptr[CDataType] c_type = _ndarray_to_type(values, type)
+ CCastOptions cast_options = CCastOptions(safe)
+
with nogil:
check_status(NdarrayToArrow(pool, values, mask, from_pandas,
c_type, cast_options, &chunked_out))
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 9344e46..ec0e490 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -316,8 +316,7 @@ def _index_level_name(index, i, column_names):
return '__index_level_{:d}__'.format(i)
-def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
- safe=True):
+def _get_columns_to_convert(df, schema, preserve_index, columns):
if schema is not None and columns is not None:
raise ValueError('Schema and columns arguments are mutually '
'exclusive, pass only one of them')
@@ -365,6 +364,44 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
name = _index_level_name(column, i, column_names)
index_column_names.append(name)
+ names = column_names + index_column_names
+
+ return (names, column_names, index_columns, index_column_names,
+ columns_to_convert, convert_types)
+
+
+def dataframe_to_types(df, preserve_index, columns=None):
+ names, column_names, index_columns, index_column_names, \
+ columns_to_convert, _ = _get_columns_to_convert(
+ df, None, preserve_index, columns
+ )
+
+ types = []
+ # If pandas knows type, skip conversion
+ for c in columns_to_convert:
+ values = c.values
+ if isinstance(values, pd.Categorical):
+ type_ = pa.array(c, from_pandas=True).type
+ else:
+ values, type_ = get_datetimetz_type(values, c.dtype, None)
+ type_ = pa.lib._ndarray_to_arrow_type(values, type_)
+ if type_ is None:
+ type_ = pa.array(c, from_pandas=True).type
+ types.append(type_)
+
+ metadata = construct_metadata(df, column_names, index_columns,
+ index_column_names, preserve_index, types)
+
+ return names, types, metadata
+
+
+def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
+ safe=True):
+ names, column_names, index_columns, index_column_names, \
+ columns_to_convert, convert_types = _get_columns_to_convert(
+ df, schema, preserve_index, columns
+ )
+
# NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
# using a thread pool is worth it. Currently the heuristic is whether the
# nrows > 100 * ncols.
@@ -402,7 +439,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
df, column_names, index_columns, index_column_names, preserve_index,
types
)
- names = column_names + index_column_names
+
return names, arrays, metadata
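
As a rough illustration of the per-column inference that dataframe_to_types performs above (simplified here to public pyarrow API; the internal path also consults get_datetimetz_type and the ndarray dtype directly before falling back to a full conversion):

import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({
    'ints': np.arange(3, dtype='int64'),
    'cats': pd.Categorical(['a', 'b', 'a']),
    'when': pd.to_datetime(['2018-11-26', '2018-11-27', '2018-11-28']),
})

# For each column, pa.array(..., from_pandas=True).type is the fallback
# used when the numpy dtype alone does not determine the Arrow type
# (e.g. Categorical or object columns)
for name in df.columns:
    print(name, pa.array(df[name], from_pandas=True).type)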
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index b574713..b15cb57 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -17,10 +17,11 @@
from collections import OrderedDict
-import numpy as np
import pickle
import pytest
+import pandas as pd
+import numpy as np
import pyarrow as pa
import pyarrow.types as types
@@ -478,7 +479,7 @@ def test_field_add_remove_metadata():
def test_empty_table():
schema = pa.schema([
- pa.field("oneField", pa.int64())
+ pa.field('oneField', pa.int64())
])
table = schema.empty_table()
assert isinstance(table, pa.Table)
@@ -505,3 +506,20 @@ def test_is_boolean_value():
assert pa.types.is_boolean_value(False)
assert pa.types.is_boolean_value(np.bool_(True))
assert pa.types.is_boolean_value(np.bool_(False))
+
+
+@pytest.mark.parametrize('data', [
+ list(range(10)),
+ pd.Categorical(list(range(10))),
+ ['foo', 'bar', None, 'baz', 'qux'],
+ np.array([
+ '2007-07-13T01:23:34.123456789',
+ '2006-01-13T12:34:56.432539784',
+ '2010-08-13T05:46:57.437699912'
+ ], dtype='datetime64[ns]')
+])
+def test_schema_from_pandas(data):
+ df = pd.DataFrame({'a': data})
+ schema = pa.Schema.from_pandas(df)
+ expected = pa.Table.from_pandas(df).schema
+ assert schema == expected
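
The types.pxi change below also exposes a preserve_index flag; a small hedged example of its effect (field names follow the docstring example in the next hunk):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'int': [1, 2], 'str': ['a', 'b']})

# With preserve_index=True (the default), the DataFrame index is carried
# into the schema as an extra __index_level_0__: int64 field
print(pa.Schema.from_pandas(df, preserve_index=True))

# With preserve_index=False, only the 'int' and 'str' fields remain
print(pa.Schema.from_pandas(df, preserve_index=False))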
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 399f15e..726bf0c 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -591,6 +591,45 @@ cdef class Schema:
return self.sp_schema.get().Equals(deref(_other.schema),
check_metadata)
+ @classmethod
+ def from_pandas(cls, df, bint preserve_index=True):
+ """
+ Returns implied schema from dataframe
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ preserve_index : bool, default True
+ Whether to store the index as an additional column (or columns, for
+ MultiIndex) in the resulting `Table`.
+
+ Returns
+ -------
+ pyarrow.Schema
+
+ Examples
+ --------
+
+ >>> import pandas as pd
+ >>> import pyarrow as pa
+ >>> df = pd.DataFrame({
+ ... 'int': [1, 2],
+ ... 'str': ['a', 'b']
+ ... })
+ >>> pa.Schema.from_pandas(df)
+ int: int64
+ str: string
+ __index_level_0__: int64
+ """
+ names, types, metadata = pdcompat.dataframe_to_types(
+ df,
+ preserve_index=preserve_index
+ )
+ fields = []
+ for name, type_ in zip(names, types):
+ fields.append(field(name, type_))
+ return schema(fields, metadata)
+
def field_by_name(self, name):
"""
Access a field by its name rather than the column index.