You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@arrow.apache.org by uw...@apache.org on 2018/01/25 22:17:33 UTC

[arrow] branch master updated: ARROW-1961: [Python] Preserve pre-existing schema metadata in Parquet files when passing flavor='spark'

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 51046a0  ARROW-1961: [Python] Preserve pre-existing schema metadata in Parquet files when passing flavor='spark'
51046a0 is described below

commit 51046a0ac80913df99605ca4d78d8561fe3101d5
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Thu Jan 25 23:17:28 2018 +0100

    ARROW-1961: [Python] Preserve pre-existing schema metadata in Parquet files when passing flavor='spark'
    
    Author: Wes McKinney <we...@twosigma.com>
    
    Closes #1511 from wesm/ARROW-1961 and squashes the following commits:
    
    e13b6b4 [Wes McKinney] Preserve pre-existing schema metadata when sanitizing fields when passing flavor='spark'
---
 python/pyarrow/parquet.py            |  4 +++-
 python/pyarrow/tests/test_parquet.py | 22 ++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 151e0df..3a0924a 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -215,7 +215,9 @@ def _sanitize_schema(schema, flavor):
                 sanitized_fields.append(sanitized_field)
             else:
                 sanitized_fields.append(field)
-        return pa.schema(sanitized_fields), schema_changed
+
+        new_schema = pa.schema(sanitized_fields, metadata=schema.metadata)
+        return new_schema, schema_changed
     else:
         return schema, False
 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c2bb31c..7c2edb3 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -748,6 +748,28 @@ def test_sanitized_spark_field_names():
     assert result.schema[0].name == expected_name
 
 
+def _roundtrip_pandas_dataframe(df, write_kwargs):
+    table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(table, buf, **write_kwargs)
+
+    buf.seek(0)
+    table1 = _read_table(buf)
+    return table1.to_pandas()
+
+
+@parquet
+def test_spark_flavor_preserves_pandas_metadata():
+    df = _test_dataframe(size=100)
+    df.index = np.arange(0, 10 * len(df), 10)
+    df.index.name = 'foo'
+
+    result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
+                                              'flavor': 'spark'})
+    tm.assert_frame_equal(result, df)
+
+
 @parquet
 def test_fixed_size_binary():
     t0 = pa.binary(10)

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.