You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/11/11 22:09:57 UTC
[arrow] branch master updated: ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7c205b0 ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way
7c205b0 is described below
commit 7c205b0337cd0364a7f9e1e19a9a6d5423abfe30
Author: Phillip Cloud <cp...@gmail.com>
AuthorDate: Sat Nov 11 17:09:50 2017 -0500
ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way
Author: Phillip Cloud <cp...@gmail.com>
Closes #1298 from cpcloud/ARROW-1787 and squashes the following commits:
6f5fbd55 [Phillip Cloud] Add more index naming tests
8ba06256 [Phillip Cloud] Add test data
56e7fe58 [Phillip Cloud] Use BytesIO
5f50da38 [Phillip Cloud] Implement
d61c43e6 [Phillip Cloud] Add test
9abad95f [Phillip Cloud] Add test data
---
python/pyarrow/pandas_compat.py | 13 +++-
.../tests/data/v0.7.1.all-named-index.parquet | Bin 0 -> 3948 bytes
python/pyarrow/tests/data/v0.7.1.parquet | Bin 0 -> 4372 bytes
.../tests/data/v0.7.1.some-named-index.parquet | Bin 0 -> 4008 bytes
python/pyarrow/tests/test_parquet.py | 73 +++++++++++++++++++++
5 files changed, 85 insertions(+), 1 deletion(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 87b47b8..db28ee0 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -18,6 +18,7 @@
import ast
import collections
import json
+import re
import numpy as np
import pandas as pd
@@ -353,6 +354,14 @@ def make_datetimetz(tz):
return DatetimeTZDtype('ns', tz=tz)
+def backwards_compatible_index_name(raw_name, logical_name):
+ pattern = r'^__index_level_\d+__$'
+ if raw_name == logical_name and re.match(pattern, raw_name) is not None:
+ return None
+ else:
+ return logical_name
+
+
def table_to_blockmanager(options, table, memory_pool, nthreads=1):
import pandas.core.internals as _int
import pyarrow.lib as lib
@@ -394,7 +403,9 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
values = values.copy()
index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
- index_names.append(logical_name)
+ index_names.append(
+ backwards_compatible_index_name(raw_name, logical_name)
+ )
block_table = block_table.remove_column(
block_table.schema.get_field_index(raw_name)
)
diff --git a/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet b/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet
new file mode 100644
index 0000000..e9efd9b
Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet differ
diff --git a/python/pyarrow/tests/data/v0.7.1.parquet b/python/pyarrow/tests/data/v0.7.1.parquet
new file mode 100644
index 0000000..44670bc
Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.parquet differ
diff --git a/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet b/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet
new file mode 100644
index 0000000..34097ca
Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet differ
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index e2e6863..6ba4fd2 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1458,3 +1458,76 @@ def test_index_column_name_duplicate(tmpdir):
arrow_table = _read_table(path)
result_df = arrow_table.to_pandas()
tm.assert_frame_equal(result_df, dfx)
+
+
+def test_backwards_compatible_index_naming():
+ expected_string = b"""\
+carat cut color clarity depth table price x y z
+ 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
+ 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
+ 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
+ 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
+ 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
+ 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
+ 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
+ 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
+ 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
+ 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
+ expected = pd.read_csv(
+ io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0
+ )
+ path = os.path.join(os.path.dirname(__file__), 'data', 'v0.7.1.parquet')
+ t = _read_table(path)
+ result = t.to_pandas()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_backwards_compatible_index_multi_level_named():
+ expected_string = b"""\
+carat cut color clarity depth table price x y z
+ 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
+ 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
+ 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
+ 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
+ 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
+ 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
+ 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
+ 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
+ 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
+ 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
+ expected = pd.read_csv(
+ io.BytesIO(expected_string),
+ sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+ ).sort_index()
+ path = os.path.join(
+ os.path.dirname(__file__), 'data', 'v0.7.1.all-named-index.parquet'
+ )
+ t = _read_table(path)
+ result = t.to_pandas()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_backwards_compatible_index_multi_level_some_named():
+ expected_string = b"""\
+carat cut color clarity depth table price x y z
+ 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
+ 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
+ 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
+ 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
+ 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
+ 0.24 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
+ 0.24 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
+ 0.26 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
+ 0.22 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
+ 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39"""
+ expected = pd.read_csv(
+ io.BytesIO(expected_string),
+ sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+ ).sort_index()
+ expected.index = expected.index.set_names(['cut', None, 'clarity'])
+ path = os.path.join(
+ os.path.dirname(__file__), 'data', 'v0.7.1.some-named-index.parquet'
+ )
+ t = _read_table(path)
+ result = t.to_pandas()
+ tm.assert_frame_equal(result, expected)
--
To stop receiving notification emails like this one, please contact
"commits@arrow.apache.org" <co...@arrow.apache.org>.