Posted to commits@arrow.apache.org by we...@apache.org on 2017/11/11 22:09:57 UTC

[arrow] branch master updated: ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7c205b0  ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way
7c205b0 is described below

commit 7c205b0337cd0364a7f9e1e19a9a6d5423abfe30
Author: Phillip Cloud <cp...@gmail.com>
AuthorDate: Sat Nov 11 17:09:50 2017 -0500

    ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way
    
    Author: Phillip Cloud <cp...@gmail.com>
    
    Closes #1298 from cpcloud/ARROW-1787 and squashes the following commits:
    
    6f5fbd55 [Phillip Cloud] Add more index naming tests
    8ba06256 [Phillip Cloud] Add test data
    56e7fe58 [Phillip Cloud] Use BytesIO
    5f50da38 [Phillip Cloud] Implement
    d61c43e6 [Phillip Cloud] Add test
    9abad95f [Phillip Cloud] Add test data
---
 python/pyarrow/pandas_compat.py                    |  13 +++-
 .../tests/data/v0.7.1.all-named-index.parquet      | Bin 0 -> 3948 bytes
 python/pyarrow/tests/data/v0.7.1.parquet           | Bin 0 -> 4372 bytes
 .../tests/data/v0.7.1.some-named-index.parquet     | Bin 0 -> 4008 bytes
 python/pyarrow/tests/test_parquet.py               |  73 +++++++++++++++++++++
 5 files changed, 85 insertions(+), 1 deletion(-)
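
In practice, the change means that Parquet files written by pyarrow 0.7.1, which serialize an unnamed pandas index under an auto-generated column name such as __index_level_0__, now round-trip back to an unnamed index instead of exposing that internal name. A minimal sketch of the user-visible behavior, assuming a legacy file like the v0.7.1.parquet fixture added in this commit:

    import pyarrow.parquet as pq

    # A file written by pyarrow 0.7.1 from a DataFrame with a default
    # (unnamed) RangeIndex stores the index column as '__index_level_0__'
    # in the file's pandas metadata.
    table = pq.read_table('python/pyarrow/tests/data/v0.7.1.parquet')
    df = table.to_pandas()

    # With this change the auto-generated name is mapped back to None,
    # so the restored DataFrame has an unnamed index again.
    assert df.index.name is None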

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 87b47b8..db28ee0 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -18,6 +18,7 @@
 import ast
 import collections
 import json
+import re
 
 import numpy as np
 import pandas as pd
@@ -353,6 +354,14 @@ def make_datetimetz(tz):
     return DatetimeTZDtype('ns', tz=tz)
 
 
+def backwards_compatible_index_name(raw_name, logical_name):
+    pattern = r'^__index_level_\d+__$'
+    if raw_name == logical_name and re.match(pattern, raw_name) is not None:
+        return None
+    else:
+        return logical_name
+
+
 def table_to_blockmanager(options, table, memory_pool, nthreads=1):
     import pandas.core.internals as _int
     import pyarrow.lib as lib
@@ -394,7 +403,9 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
                 values = values.copy()
 
             index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
-            index_names.append(logical_name)
+            index_names.append(
+                backwards_compatible_index_name(raw_name, logical_name)
+            )
             block_table = block_table.remove_column(
                 block_table.schema.get_field_index(raw_name)
             )
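
The helper above is the core of the fix: an index name is dropped (returned as None) only when the raw column name and the logical name are the same auto-generated __index_level_N__ string; any user-supplied name is preserved. A small sketch of that mapping, assuming the internal helper remains importable from pyarrow.pandas_compat as added here (it is not public API):

    from pyarrow.pandas_compat import backwards_compatible_index_name

    # Auto-generated index names round-trip to an unnamed index ...
    assert backwards_compatible_index_name(
        '__index_level_0__', '__index_level_0__') is None

    # ... while user-supplied names, and raw names that map to a different
    # logical name, are kept as-is.
    assert backwards_compatible_index_name('__index_level_0__', 'foo') == 'foo'
    assert backwards_compatible_index_name('cut', 'cut') == 'cut'
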
diff --git a/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet b/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet
new file mode 100644
index 0000000..e9efd9b
Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.all-named-index.parquet differ
diff --git a/python/pyarrow/tests/data/v0.7.1.parquet b/python/pyarrow/tests/data/v0.7.1.parquet
new file mode 100644
index 0000000..44670bc
Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.parquet differ
diff --git a/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet b/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet
new file mode 100644
index 0000000..34097ca
Binary files /dev/null and b/python/pyarrow/tests/data/v0.7.1.some-named-index.parquet differ
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index e2e6863..6ba4fd2 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1458,3 +1458,76 @@ def test_index_column_name_duplicate(tmpdir):
     arrow_table = _read_table(path)
     result_df = arrow_table.to_pandas()
     tm.assert_frame_equal(result_df, dfx)
+
+
+def test_backwards_compatible_index_naming():
+    expected_string = b"""\
+carat        cut  color  clarity  depth  table  price     x     y     z
+ 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+ 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+ 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+ 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+ 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+ 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+ 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+ 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+ 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+ 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+    expected = pd.read_csv(
+        io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0
+    )
+    path = os.path.join(os.path.dirname(__file__), 'data', 'v0.7.1.parquet')
+    t = _read_table(path)
+    result = t.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_backwards_compatible_index_multi_level_named():
+    expected_string = b"""\
+carat        cut  color  clarity  depth  table  price     x     y     z
+ 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+ 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+ 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+ 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+ 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+ 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+ 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+ 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+ 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+ 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+    expected = pd.read_csv(
+        io.BytesIO(expected_string),
+        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+    ).sort_index()
+    path = os.path.join(
+        os.path.dirname(__file__), 'data', 'v0.7.1.all-named-index.parquet'
+    )
+    t = _read_table(path)
+    result = t.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_backwards_compatible_index_multi_level_some_named():
+    expected_string = b"""\
+carat        cut  color  clarity  depth  table  price     x     y     z
+ 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+ 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+ 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+ 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+ 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+ 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+ 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+ 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+ 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+ 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+    expected = pd.read_csv(
+        io.BytesIO(expected_string),
+        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+    ).sort_index()
+    expected.index = expected.index.set_names(['cut', None, 'clarity'])
+    path = os.path.join(
+        os.path.dirname(__file__), 'data', 'v0.7.1.some-named-index.parquet'
+    )
+    t = _read_table(path)
+    result = t.to_pandas()
+    tm.assert_frame_equal(result, expected)
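
The three new tests read fixture files written with pyarrow 0.7.1 that cover an unnamed default index, a fully named three-level MultiIndex, and a MultiIndex with one unnamed level. The commit does not include the script that produced those fixtures; the following is only a hedged sketch of how equivalent files could be regenerated under a pyarrow 0.7.1 environment (the diamonds.csv source path and the variable names are assumptions):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Hypothetical source: the first ten rows of the diamonds sample that
    # appears in the expected strings above.
    df = pd.read_csv('diamonds.csv').head(10)

    # v0.7.1.parquet: default unnamed RangeIndex
    pq.write_table(pa.Table.from_pandas(df), 'v0.7.1.parquet')

    # v0.7.1.all-named-index.parquet: every index level named
    named = df.set_index(['cut', 'color', 'clarity']).sort_index()
    pq.write_table(pa.Table.from_pandas(named),
                   'v0.7.1.all-named-index.parquet')

    # v0.7.1.some-named-index.parquet: the middle level left unnamed
    some = named.copy()
    some.index = some.index.set_names(['cut', None, 'clarity'])
    pq.write_table(pa.Table.from_pandas(some),
                   'v0.7.1.some-named-index.parquet')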

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].