You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/22 20:10:29 UTC
[arrow] branch master updated: ARROW-5169: [Python] preserve field nullability of specified schema in Table.from_pandas

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a566bc2  ARROW-5169: [Python] preserve field nullability of specified schema in Table.from_pandas
a566bc2 is described below

commit a566bc21bf7b5113c76b85a5bbe4bbea13411f9b
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Sat Jun 22 15:10:21 2019 -0500

    ARROW-5169: [Python] preserve field nullability of specified schema in Table.from_pandas
    
    Author: Joris Van den Bossche <jo...@gmail.com>
    
    Closes #4397 from jorisvandenbossche/ARROW-5169-table-from-pandas-schema-nullability and squashes the following commits:
    
    f4b9c71b5 <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-5169-table-from-pandas-schema-nullability
    4baef2a3c <Joris Van den Bossche> update return comment
    496f731ae <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-5169-table-from-pandas-schema-nullability
    dbdac8cbc <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-5169-table-from-pandas-schema-nullability
    d693d4645 <Joris Van den Bossche> fix case of None as column name
    d5b322431 <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-5169-table-from-pandas-schema-nullability
    fc357a642 <Joris Van den Bossche> ARROW-5169:  preserve field nullability of specified schema in Table.from_pandas
---
 python/pyarrow/pandas_compat.py     | 24 +++++++++++++++++++++---
 python/pyarrow/table.pxi            |  8 ++++----
 python/pyarrow/tests/test_pandas.py | 14 ++++++++++++++
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index ea38d41..50cabb8 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -374,6 +374,7 @@ def _get_columns_to_convert(df, schema, preserve_index, columns):
     # all_names : all of the columns in the resulting table including the data
     # columns and serialized index columns
     # column_names : the names of the data columns
+    # index_column_names : the names of the serialized index columns
     # index_descriptors : descriptions of each index to be used for
     # reconstruction
     # index_levels : the extracted index level values
@@ -381,8 +382,8 @@ def _get_columns_to_convert(df, schema, preserve_index, columns):
     # to be converted to Arrow format
     # columns_types : specified column types to use for coercion / casting
     # during serialization, if a Schema was provided
-    return (all_names, column_names, index_descriptors, index_levels,
-            columns_to_convert, convert_types)
+    return (all_names, column_names, index_column_names, index_descriptors,
+            index_levels, columns_to_convert, convert_types)
 
 
 def _get_range_index_descriptor(level):
@@ -418,6 +419,7 @@ def _resolve_columns_of_interest(df, schema, columns):
 def dataframe_to_types(df, preserve_index, columns=None):
     (all_names,
      column_names,
+     _,
      index_descriptors,
      index_columns,
      columns_to_convert,
@@ -446,6 +448,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
                         safe=True):
     (all_names,
      column_names,
+     index_column_names,
      index_descriptors,
      index_columns,
      columns_to_convert,
@@ -485,10 +488,25 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
 
     types = [x.type for x in arrays]
 
+    if schema is not None:
+        # add index columns
+        index_types = types[len(column_names):]
+        for name, type_ in zip(index_column_names, index_types):
+            name = name if name is not None else 'None'
+            schema = schema.append(pa.field(name, type_))
+    else:
+        fields = []
+        for name, type_ in zip(all_names, types):
+            name = name if name is not None else 'None'
+            fields.append(pa.field(name, type_))
+        schema = pa.schema(fields)
+
     metadata = construct_metadata(df, column_names, index_columns,
                                   index_descriptors, preserve_index,
                                   types)
-    return all_names, arrays, metadata
+    schema = schema.add_metadata(metadata)
+
+    return arrays, schema
 
 
 def get_datetimetz_type(values, dtype, type_):
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index db26dc2..688050b 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -861,10 +861,10 @@ cdef class RecordBatch(_PandasConvertible):
         pyarrow.RecordBatch
         """
         from pyarrow.pandas_compat import dataframe_to_arrays
-        names, arrays, metadata = dataframe_to_arrays(
+        arrays, schema = dataframe_to_arrays(
             df, schema, preserve_index, nthreads=nthreads, columns=columns
         )
-        return cls.from_arrays(arrays, names, metadata)
+        return cls.from_arrays(arrays, schema)
 
     @staticmethod
     def from_arrays(list arrays, names, metadata=None):
@@ -1142,7 +1142,7 @@ cdef class Table(_PandasConvertible):
         <pyarrow.lib.Table object at 0x7f05d1fb1b40>
         """
         from pyarrow.pandas_compat import dataframe_to_arrays
-        names, arrays, metadata = dataframe_to_arrays(
+        arrays, schema = dataframe_to_arrays(
             df,
             schema=schema,
             preserve_index=preserve_index,
@@ -1150,7 +1150,7 @@ cdef class Table(_PandasConvertible):
             columns=columns,
             safe=safe
         )
-        return cls.from_arrays(arrays, names=names, metadata=metadata)
+        return cls.from_arrays(arrays, schema=schema)
 
     @staticmethod
     def from_arrays(arrays, names=None, schema=None, metadata=None):
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 4af3708..5ea3d19 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2607,6 +2607,20 @@ def test_table_from_pandas_columns_and_schema_are_mutually_exclusive():
         pa.Table.from_pandas(df, schema=schema, columns=columns)
 
 
+def test_table_from_pandas_keeps_schema_nullability():
+    # ARROW-5169
+    df = pd.DataFrame({'a': [1, 2, 3, 4]})
+
+    schema = pa.schema([
+        pa.field('a', pa.int64(), nullable=False),
+    ])
+
+    table = pa.Table.from_pandas(df)
+    assert table.schema.field_by_name('a').nullable is True
+    table = pa.Table.from_pandas(df, schema=schema)
+    assert table.schema.field_by_name('a').nullable is False
+
+
 # ----------------------------------------------------------------------
 # RecordBatch, Table