You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/23 00:37:25 UTC
[arrow] branch master updated: ARROW-2592: [Python] Add
"ignore_metadata" option to Table.to_pandas
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6578089 ARROW-2592: [Python] Add "ignore_metadata" option to Table.to_pandas
6578089 is described below
commit 6578089472958b20126d5c56fe8f8737b02b5544
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Dec 23 01:37:13 2018 +0100
ARROW-2592: [Python] Add "ignore_metadata" option to Table.to_pandas
This option bypasses the index reconstruction logic when the table schema carries `'pandas'` metadata. The same result can be achieved with `table.cast(table.schema.remove_metadata()).to_pandas()`, but a dedicated `ignore_metadata` option is more obvious and discoverable to users.
Motivation: a user was unable to read a Parquet file that contained old `'pandas'` metadata that we can no longer process correctly.
Author: Wes McKinney <we...@apache.org>
Closes #3239 from wesm/ARROW-2592 and squashes the following commits:
82ac7a01 <Wes McKinney> Unit test for ignore_metadata option
6c4246ef <Wes McKinney> Test stub
8cf45a7a <Wes McKinney> Add ignore_metadata option to Table.to_pandas
---
python/pyarrow/pandas_compat.py | 6 ++++--
python/pyarrow/table.pxi | 16 ++++++++++++----
python/pyarrow/tests/test_convert_pandas.py | 11 +++++++++++
3 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 0eebcf6..6acca0c 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -548,7 +548,8 @@ def _make_datetimetz(tz):
# Converting pyarrow.Table efficiently to pandas.DataFrame
-def table_to_blockmanager(options, table, memory_pool, categories=None):
+def table_to_blockmanager(options, table, memory_pool, categories=None,
+ ignore_metadata=False):
from pyarrow.compat import DatetimeTZDtype
index_columns = []
@@ -560,7 +561,8 @@ def table_to_blockmanager(options, table, memory_pool, categories=None):
row_count = table.num_rows
metadata = schema.metadata
- has_pandas_metadata = metadata is not None and b'pandas' in metadata
+ has_pandas_metadata = (not ignore_metadata and metadata is not None
+ and b'pandas' in metadata)
if has_pandas_metadata:
pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 4d52f26..29a784d 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -890,7 +890,7 @@ cdef class RecordBatch:
def to_pandas(self, MemoryPool memory_pool=None, categories=None,
bint strings_to_categorical=False, bint zero_copy_only=False,
bint integer_object_nulls=False, bint date_as_object=False,
- bint use_threads=True):
+ bint use_threads=True, bint ignore_metadata=False):
"""
Convert the arrow::RecordBatch to a pandas DataFrame
@@ -911,6 +911,9 @@ cdef class RecordBatch:
Cast dates to objects
use_threads: boolean, default True
Whether to parallelize the conversion using multiple threads
+ ignore_metadata : boolean, default False
+ If True, do not use the 'pandas' metadata to reconstruct the
+ DataFrame index, if present
Returns
-------
@@ -921,7 +924,8 @@ cdef class RecordBatch:
strings_to_categorical=strings_to_categorical,
zero_copy_only=zero_copy_only,
integer_object_nulls=integer_object_nulls,
- date_as_object=date_as_object, use_threads=use_threads
+ date_as_object=date_as_object, use_threads=use_threads,
+ ignore_metadata=ignore_metadata
)
@classmethod
@@ -1385,7 +1389,7 @@ cdef class Table:
def to_pandas(self, MemoryPool memory_pool=None, categories=None,
bint strings_to_categorical=False, bint zero_copy_only=False,
bint integer_object_nulls=False, bint date_as_object=False,
- bint use_threads=True):
+ bint use_threads=True, bint ignore_metadata=False):
"""
Convert the arrow::Table to a pandas DataFrame
@@ -1406,6 +1410,9 @@ cdef class Table:
Cast dates to objects
use_threads: boolean, default True
Whether to parallelize the conversion using multiple threads
+ ignore_metadata : boolean, default False
+ If True, do not use the 'pandas' metadata to reconstruct the
+ DataFrame index, if present
Returns
-------
@@ -1422,7 +1429,8 @@ cdef class Table:
use_threads=use_threads)
mgr = pdcompat.table_to_blockmanager(options, self, memory_pool,
- categories)
+ categories,
+ ignore_metadata=ignore_metadata)
return pd.DataFrame(mgr)
def to_pydict(self):
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 41bcae8..1221484 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -376,6 +376,17 @@ class TestConvertMetadata(object):
assert data_column['pandas_type'] == 'bytes'
assert data_column['numpy_type'] == 'object'
+ def test_ignore_metadata(self):
+ df = pd.DataFrame({'a': [1, 2, 3], 'b': ['foo', 'bar', 'baz']},
+ index=['one', 'two', 'three'])
+ table = pa.Table.from_pandas(df)
+
+ result = table.to_pandas(ignore_metadata=True)
+ expected = (table.cast(table.schema.remove_metadata())
+ .to_pandas())
+
+ assert result.equals(expected)
+
def test_list_metadata(self):
df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])