You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/11/26 14:29:25 UTC

[arrow] branch master updated: ARROW-1993: [Python] Add function for determining implied Arrow schema from pandas.DataFrame

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 54b0af8  ARROW-1993: [Python] Add function for determining implied Arrow schema from pandas.DataFrame
54b0af8 is described below

commit 54b0af837b0ad2f67e97c61a7cdeeca7fd081d0d
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Mon Nov 26 09:29:18 2018 -0500

    ARROW-1993: [Python] Add function for determining implied Arrow schema from pandas.DataFrame
    
    Author: Krisztián Szűcs <sz...@gmail.com>
    Author: Kee Chong Tan <keechong.tan@>
    
    Closes #1929 from keechongtan/ARROW-1993 and squashes the following commits:
    
    22c357e09 <Krisztián Szűcs> use tmp variable
    a9d6a5c7d <Krisztián Szűcs> documentation fixes
    3544e42a1 <Krisztián Szűcs> except
    d81983475 <Krisztián Szűcs> fix segfault on py2
    0cf42b57d <Krisztián Szűcs> fix exception handling
    00e86f64c <Krisztián Szűcs> slightly rename functions
    7113b6d79 <Krisztián Szűcs> rebase
    c7409c6df <Kee Chong Tan> Fix incorrect variable used
    d631fb308 <Kee Chong Tan> Add function for determining implied Arrow schema from pandas.DataFrame
    b04a09b3d <Kee Chong Tan> Fix incorrect variable used
    a5c8b9d0c <Kee Chong Tan> Add function for determining implied Arrow schema from pandas.DataFrame
---
 python/doc/source/pandas.rst       |  2 ++
 python/pyarrow/array.pxi           | 23 ++++++++++++++------
 python/pyarrow/pandas_compat.py    | 43 +++++++++++++++++++++++++++++++++++---
 python/pyarrow/tests/test_types.py | 22 +++++++++++++++++--
 python/pyarrow/types.pxi           | 39 ++++++++++++++++++++++++++++++++++
 5 files changed, 118 insertions(+), 11 deletions(-)

diff --git a/python/doc/source/pandas.rst b/python/doc/source/pandas.rst
index be11b5b..6ade171 100644
--- a/python/doc/source/pandas.rst
+++ b/python/doc/source/pandas.rst
@@ -52,6 +52,8 @@ Conversion from a Table to a DataFrame is done by calling
     # Convert back to pandas
     df_new = table.to_pandas()
 
+    # Infer Arrow schema from pandas
+    schema = pa.Schema.from_pandas(df)
 
 Series
 ------
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index c3a17a1..b86872f 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -49,12 +49,13 @@ cdef _is_array_like(obj):
         return isinstance(obj, np.ndarray)
 
 
-cdef _ndarray_to_array(object values, object mask, DataType type,
-                       c_bool from_pandas, c_bool safe, CMemoryPool* pool):
-    cdef:
-        shared_ptr[CChunkedArray] chunked_out
-        shared_ptr[CDataType] c_type
-        CCastOptions cast_options = CCastOptions(safe)
+def _ndarray_to_arrow_type(object values, DataType type):
+    return pyarrow_wrap_data_type(_ndarray_to_type(values, type))
+
+
+cdef shared_ptr[CDataType] _ndarray_to_type(object values,
+                                            DataType type) except *:
+    cdef shared_ptr[CDataType] c_type
 
     dtype = values.dtype
 
@@ -65,6 +66,16 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
     if type is not None:
         c_type = type.sp_type
 
+    return c_type
+
+
+cdef _ndarray_to_array(object values, object mask, DataType type,
+                       c_bool from_pandas, c_bool safe, CMemoryPool* pool):
+    cdef:
+        shared_ptr[CChunkedArray] chunked_out
+        shared_ptr[CDataType] c_type = _ndarray_to_type(values, type)
+        CCastOptions cast_options = CCastOptions(safe)
+
     with nogil:
         check_status(NdarrayToArrow(pool, values, mask, from_pandas,
                                     c_type, cast_options, &chunked_out))
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 9344e46..ec0e490 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -316,8 +316,7 @@ def _index_level_name(index, i, column_names):
         return '__index_level_{:d}__'.format(i)
 
 
-def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
-                        safe=True):
+def _get_columns_to_convert(df, schema, preserve_index, columns):
     if schema is not None and columns is not None:
         raise ValueError('Schema and columns arguments are mutually '
                          'exclusive, pass only one of them')
@@ -365,6 +364,44 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
         name = _index_level_name(column, i, column_names)
         index_column_names.append(name)
 
+    names = column_names + index_column_names
+
+    return (names, column_names, index_columns, index_column_names,
+            columns_to_convert, convert_types)
+
+
+def dataframe_to_types(df, preserve_index, columns=None):
+    names, column_names, index_columns, index_column_names, \
+        columns_to_convert, _ = _get_columns_to_convert(
+            df, None, preserve_index, columns
+        )
+
+    types = []
+    # If pandas knows type, skip conversion
+    for c in columns_to_convert:
+        values = c.values
+        if isinstance(values, pd.Categorical):
+            type_ = pa.array(c, from_pandas=True).type
+        else:
+            values, type_ = get_datetimetz_type(values, c.dtype, None)
+            type_ = pa.lib._ndarray_to_arrow_type(values, type_)
+            if type_ is None:
+                type_ = pa.array(c, from_pandas=True).type
+        types.append(type_)
+
+    metadata = construct_metadata(df, column_names, index_columns,
+                                  index_column_names, preserve_index, types)
+
+    return names, types, metadata
+
+
+def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
+                        safe=True):
+    names, column_names, index_columns, index_column_names, \
+        columns_to_convert, convert_types = _get_columns_to_convert(
+            df, schema, preserve_index, columns
+        )
+
     # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
     # using a thread pool is worth it. Currently the heuristic is whether the
     # nrows > 100 * ncols.
@@ -402,7 +439,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
         df, column_names, index_columns, index_column_names, preserve_index,
         types
     )
-    names = column_names + index_column_names
+
     return names, arrays, metadata
 
 
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index b574713..b15cb57 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -17,10 +17,11 @@
 
 from collections import OrderedDict
 
-import numpy as np
 import pickle
 import pytest
 
+import pandas as pd
+import numpy as np
 import pyarrow as pa
 import pyarrow.types as types
 
@@ -478,7 +479,7 @@ def test_field_add_remove_metadata():
 
 def test_empty_table():
     schema = pa.schema([
-        pa.field("oneField", pa.int64())
+        pa.field('oneField', pa.int64())
     ])
     table = schema.empty_table()
     assert isinstance(table, pa.Table)
@@ -505,3 +506,20 @@ def test_is_boolean_value():
     assert pa.types.is_boolean_value(False)
     assert pa.types.is_boolean_value(np.bool_(True))
     assert pa.types.is_boolean_value(np.bool_(False))
+
+
+@pytest.mark.parametrize('data', [
+    list(range(10)),
+    pd.Categorical(list(range(10))),
+    ['foo', 'bar', None, 'baz', 'qux'],
+    np.array([
+        '2007-07-13T01:23:34.123456789',
+        '2006-01-13T12:34:56.432539784',
+        '2010-08-13T05:46:57.437699912'
+    ], dtype='datetime64[ns]')
+])
+def test_schema_from_pandas(data):
+    df = pd.DataFrame({'a': data})
+    schema = pa.Schema.from_pandas(df)
+    expected = pa.Table.from_pandas(df).schema
+    assert schema == expected
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 399f15e..726bf0c 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -591,6 +591,45 @@ cdef class Schema:
         return self.sp_schema.get().Equals(deref(_other.schema),
                                            check_metadata)
 
+    @classmethod
+    def from_pandas(cls, df, bint preserve_index=True):
+        """
+        Returns implied schema from dataframe
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+        preserve_index : bool, default True
+            Whether to include the index as an additional field (or fields, for
+            MultiIndex) in the resulting `Schema`.
+
+        Returns
+        -------
+        pyarrow.Schema
+
+        Examples
+        --------
+
+        >>> import pandas as pd
+        >>> import pyarrow as pa
+        >>> df = pd.DataFrame({
+        ...     'int': [1, 2],
+        ...     'str': ['a', 'b']
+        ... })
+        >>> pa.Schema.from_pandas(df)
+        int: int64
+        str: string
+        __index_level_0__: int64
+        """
+        names, types, metadata = pdcompat.dataframe_to_types(
+            df,
+            preserve_index=preserve_index
+        )
+        fields = []
+        for name, type_ in zip(names, types):
+            fields.append(field(name, type_))
+        return schema(fields, metadata)
+
     def field_by_name(self, name):
         """
         Access a field by its name rather than the column index.