You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/01/25 22:17:33 UTC
[arrow] branch master updated: ARROW-1961: [Python] Preserve pre-existing schema metadata in Parquet files when passing flavor='spark'
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 51046a0 ARROW-1961: [Python] Preserve pre-existing schema metadata in Parquet files when passing flavor='spark'
51046a0 is described below
commit 51046a0ac80913df99605ca4d78d8561fe3101d5
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Thu Jan 25 23:17:28 2018 +0100
ARROW-1961: [Python] Preserve pre-existing schema metadata in Parquet files when passing flavor='spark'
Author: Wes McKinney <we...@twosigma.com>
Closes #1511 from wesm/ARROW-1961 and squashes the following commits:
e13b6b4 [Wes McKinney] Preserve pre-existing schema metadata when sanitizing fields when passing flavor='spark'
---
python/pyarrow/parquet.py | 4 +++-
python/pyarrow/tests/test_parquet.py | 22 ++++++++++++++++++++++
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 151e0df..3a0924a 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -215,7 +215,9 @@ def _sanitize_schema(schema, flavor):
sanitized_fields.append(sanitized_field)
else:
sanitized_fields.append(field)
- return pa.schema(sanitized_fields), schema_changed
+
+ new_schema = pa.schema(sanitized_fields, metadata=schema.metadata)
+ return new_schema, schema_changed
else:
return schema, False
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index c2bb31c..7c2edb3 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -748,6 +748,28 @@ def test_sanitized_spark_field_names():
assert result.schema[0].name == expected_name
+def _roundtrip_pandas_dataframe(df, write_kwargs):
+ table = pa.Table.from_pandas(df)
+
+ buf = io.BytesIO()
+ _write_table(table, buf, **write_kwargs)
+
+ buf.seek(0)
+ table1 = _read_table(buf)
+ return table1.to_pandas()
+
+
+@parquet
+def test_spark_flavor_preserves_pandas_metadata():
+ df = _test_dataframe(size=100)
+ df.index = np.arange(0, 10 * len(df), 10)
+ df.index.name = 'foo'
+
+ result = _roundtrip_pandas_dataframe(df, {'version': '2.0',
+ 'flavor': 'spark'})
+ tm.assert_frame_equal(result, df)
+
+
@parquet
def test_fixed_size_binary():
t0 = pa.binary(10)
--
To stop receiving notification emails like this one, please contact
uwe@apache.org.