You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/18 17:04:18 UTC

[arrow] branch master updated: ARROW-4076: [Python] Validate ParquetDataset schema after filtering

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 694bfc8  ARROW-4076: [Python] Validate ParquetDataset schema after filtering
694bfc8 is described below

commit 694bfc824164f2a6a27089f769451abb6299f2d5
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Tue Jun 18 12:04:10 2019 -0500

    ARROW-4076: [Python] Validate ParquetDataset schema after filtering
    
    This is adding a test for https://github.com/apache/arrow/pull/3576 (doing new PR since I can't push to that PR directly) cc @gsakkis
    
    Author: Joris Van den Bossche <jo...@gmail.com>
    Author: George Sakkis <ge...@gmail.com>
    
    Closes #4600 from jorisvandenbossche/ARROW-4076 and squashes the following commits:
    
    e884cf07a <Joris Van den Bossche> add test
    411956cbd <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-4076
    47ab8962e <George Sakkis> ARROW-4076:  Validate ParquetDataset schema after filtering
---
 python/pyarrow/parquet.py            |  6 +++---
 python/pyarrow/tests/test_parquet.py | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 754d3c1..8727fb4 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -990,13 +990,13 @@ class ParquetDataset(object):
         if split_row_groups:
             raise NotImplementedError("split_row_groups not yet implemented")
 
-        if validate_schema:
-            self.validate_schemas()
-
         if filters is not None:
             filters = _check_filters(filters)
             self._filter(filters)
 
+        if validate_schema:
+            self.validate_schemas()
+
     def equals(self, other):
         if not isinstance(other, ParquetDataset):
             raise TypeError('`other` must be an instance of ParquetDataset')
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 4598bb9..9398ea8 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2760,3 +2760,24 @@ def test_multi_dataset_metadata(tempdir):
     assert _md['num_row_groups'] == 2
     assert _md['serialized_size'] == 0
     assert md['serialized_size'] > 0
+
+
+def test_filter_before_validate_schema(tempdir):
+    # ARROW-4076 apply filter before schema validation
+    # to avoid checking unneeded schemas
+
+    # create partitioned dataset with mismatching schemas which would
+    # otherwise raise if first validating all schemas
+    dir1 = tempdir / 'A=0'
+    dir1.mkdir()
+    table1 = pa.Table.from_pandas(pd.DataFrame({'B': [1, 2, 3]}))
+    pq.write_table(table1, dir1 / 'data.parquet')
+
+    dir2 = tempdir / 'A=1'
+    dir2.mkdir()
+    table2 = pa.Table.from_pandas(pd.DataFrame({'B': ['a', 'b', 'c']}))
+    pq.write_table(table2, dir2 / 'data.parquet')
+
+    # read single file using filter
+    table = pq.read_table(tempdir, filters=[[('A', '==', 0)]])
+    assert table.column('B').equals(pa.column('B', pa.array([1, 2, 3])))