You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/23 00:37:25 UTC

[arrow] branch master updated: ARROW-2592: [Python] Add "ignore_metadata" option to Table.to_pandas

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6578089  ARROW-2592: [Python] Add "ignore_metadata" option to Table.to_pandas
6578089 is described below

commit 6578089472958b20126d5c56fe8f8737b02b5544
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Dec 23 01:37:13 2018 +0100

    ARROW-2592: [Python] Add "ignore_metadata" option to Table.to_pandas
    
    This option circumvents the index reconstruction logic when `'pandas'` metadata is present. The same effect can be achieved with `table.cast(table.schema.remove_metadata()).to_pandas()`, but the explicit option is more obvious and discoverable to users.
    
    A user had an issue reading a Parquet file containing old metadata that we can no longer process correctly.
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3239 from wesm/ARROW-2592 and squashes the following commits:
    
    82ac7a01 <Wes McKinney> Unit test for ignore_metadata option
    6c4246ef <Wes McKinney> Test stub
    8cf45a7a <Wes McKinney> Add ignore_metadata option to Table.to_pandas
---
 python/pyarrow/pandas_compat.py             |  6 ++++--
 python/pyarrow/table.pxi                    | 16 ++++++++++++----
 python/pyarrow/tests/test_convert_pandas.py | 11 +++++++++++
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 0eebcf6..6acca0c 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -548,7 +548,8 @@ def _make_datetimetz(tz):
 # Converting pyarrow.Table efficiently to pandas.DataFrame
 
 
-def table_to_blockmanager(options, table, memory_pool, categories=None):
+def table_to_blockmanager(options, table, memory_pool, categories=None,
+                          ignore_metadata=False):
     from pyarrow.compat import DatetimeTZDtype
 
     index_columns = []
@@ -560,7 +561,8 @@ def table_to_blockmanager(options, table, memory_pool, categories=None):
     row_count = table.num_rows
     metadata = schema.metadata
 
-    has_pandas_metadata = metadata is not None and b'pandas' in metadata
+    has_pandas_metadata = (not ignore_metadata and metadata is not None
+                           and b'pandas' in metadata)
 
     if has_pandas_metadata:
         pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 4d52f26..29a784d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -890,7 +890,7 @@ cdef class RecordBatch:
     def to_pandas(self, MemoryPool memory_pool=None, categories=None,
                   bint strings_to_categorical=False, bint zero_copy_only=False,
                   bint integer_object_nulls=False, bint date_as_object=False,
-                  bint use_threads=True):
+                  bint use_threads=True, bint ignore_metadata=False):
         """
         Convert the arrow::RecordBatch to a pandas DataFrame
 
@@ -911,6 +911,9 @@ cdef class RecordBatch:
             Cast dates to objects
         use_threads: boolean, default True
             Whether to parallelize the conversion using multiple threads
+        ignore_metadata : boolean, default False
+            If True, do not use the 'pandas' metadata to reconstruct the
+            DataFrame index, if present
 
         Returns
         -------
@@ -921,7 +924,8 @@ cdef class RecordBatch:
             strings_to_categorical=strings_to_categorical,
             zero_copy_only=zero_copy_only,
             integer_object_nulls=integer_object_nulls,
-            date_as_object=date_as_object, use_threads=use_threads
+            date_as_object=date_as_object, use_threads=use_threads,
+            ignore_metadata=ignore_metadata
         )
 
     @classmethod
@@ -1385,7 +1389,7 @@ cdef class Table:
     def to_pandas(self, MemoryPool memory_pool=None, categories=None,
                   bint strings_to_categorical=False, bint zero_copy_only=False,
                   bint integer_object_nulls=False, bint date_as_object=False,
-                  bint use_threads=True):
+                  bint use_threads=True, bint ignore_metadata=False):
         """
         Convert the arrow::Table to a pandas DataFrame
 
@@ -1406,6 +1410,9 @@ cdef class Table:
             Cast dates to objects
         use_threads: boolean, default True
             Whether to parallelize the conversion using multiple threads
+        ignore_metadata : boolean, default False
+            If True, do not use the 'pandas' metadata to reconstruct the
+            DataFrame index, if present
 
         Returns
         -------
@@ -1422,7 +1429,8 @@ cdef class Table:
             use_threads=use_threads)
 
         mgr = pdcompat.table_to_blockmanager(options, self, memory_pool,
-                                             categories)
+                                             categories,
+                                             ignore_metadata=ignore_metadata)
         return pd.DataFrame(mgr)
 
     def to_pydict(self):
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 41bcae8..1221484 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -376,6 +376,17 @@ class TestConvertMetadata(object):
         assert data_column['pandas_type'] == 'bytes'
         assert data_column['numpy_type'] == 'object'
 
+    def test_ignore_metadata(self):
+        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['foo', 'bar', 'baz']},
+                          index=['one', 'two', 'three'])
+        table = pa.Table.from_pandas(df)
+
+        result = table.to_pandas(ignore_metadata=True)
+        expected = (table.cast(table.schema.remove_metadata())
+                    .to_pandas())
+
+        assert result.equals(expected)
+
     def test_list_metadata(self):
         df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
         schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])