You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/07/30 00:19:33 UTC

[arrow] branch master updated: ARROW-2926: [Python] Do not attempt to write tables with invalid schemas in ParquetWriter.write_table

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 161d1f0  ARROW-2926: [Python] Do not attempt to write tables with invalid schemas in ParquetWriter.write_table
161d1f0 is described below

commit 161d1f02cf6954f78abd240c16680d4bddcc7864
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Jul 29 20:19:27 2018 -0400

    ARROW-2926: [Python] Do not attempt to write tables with invalid schemas in ParquetWriter.write_table
    
    To be honest, `parquet::arrow::FileWriter` should probably also validate. I will open a corresponding issue "upstream".
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #2339 from wesm/ARROW-2926 and squashes the following commits:
    
    8d2ee259 <Wes McKinney> Validate schemas in ParquetWriter.write_table
---
 python/pyarrow/parquet.py            |  7 +++++++
 python/pyarrow/tests/test_parquet.py | 24 ++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 2c1aef0..343758a 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -327,6 +327,13 @@ schema : arrow Schema
         if self.schema_changed:
             table = _sanitize_table(table, self.schema, self.flavor)
         assert self.is_open
+
+        if not table.schema.equals(self.schema):
+            msg = ('Table schema does not match schema used to create file: '
+                   '\ntable:\n{0!s} vs. \nfile:\n{1!s}'.format(table.schema,
+                                                               self.schema))
+            raise ValueError(msg)
+
         self.writer.write_table(table, row_group_size=row_group_size)
 
     def close(self):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index cc86ef1..1d3a6c1 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -685,6 +685,30 @@ def test_compare_schemas():
     assert fileh.schema[0] != 'arbitrary object'
 
 
+def test_validate_schema_write_table(tmpdir):
+    # ARROW-2926
+    import pyarrow.parquet as pq
+
+    simple_fields = [
+        pa.field('POS', pa.uint32()),
+        pa.field('desc', pa.string())
+    ]
+
+    simple_schema = pa.schema(simple_fields)
+
+    # simple_table schema does not match simple_schema
+    simple_from_array = [pa.array([1]), pa.array(['bla'])]
+    simple_table = pa.Table.from_arrays(simple_from_array, ['POS', 'desc'])
+
+    path = tmpdir.join('simple_validate_schema.parquet').strpath
+
+    with pq.ParquetWriter(path, simple_schema,
+                          version='2.0',
+                          compression='snappy', flavor='spark') as w:
+        with pytest.raises(ValueError):
+            w.write_table(simple_table)
+
+
 def test_column_of_arrays(tmpdir):
     df, schema = dataframe_with_arrays()