You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/18 17:04:18 UTC
[arrow] branch master updated: ARROW-4076: [Python] Validate
ParquetDataset schema after filtering
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 694bfc8 ARROW-4076: [Python] Validate ParquetDataset schema after filtering
694bfc8 is described below
commit 694bfc824164f2a6a27089f769451abb6299f2d5
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Tue Jun 18 12:04:10 2019 -0500
ARROW-4076: [Python] Validate ParquetDataset schema after filtering
This is adding a test for https://github.com/apache/arrow/pull/3576 (doing new PR since I can't push to that PR directly) cc @gsakkis
Author: Joris Van den Bossche <jo...@gmail.com>
Author: George Sakkis <ge...@gmail.com>
Closes #4600 from jorisvandenbossche/ARROW-4076 and squashes the following commits:
e884cf07a <Joris Van den Bossche> add test
411956cbd <Joris Van den Bossche> Merge remote-tracking branch 'upstream/master' into ARROW-4076
47ab8962e <George Sakkis> ARROW-4076: Validate ParquetDataset schema after filtering
---
python/pyarrow/parquet.py | 6 +++---
python/pyarrow/tests/test_parquet.py | 21 +++++++++++++++++++++
2 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 754d3c1..8727fb4 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -990,13 +990,13 @@ class ParquetDataset(object):
if split_row_groups:
raise NotImplementedError("split_row_groups not yet implemented")
- if validate_schema:
- self.validate_schemas()
-
if filters is not None:
filters = _check_filters(filters)
self._filter(filters)
+ if validate_schema:
+ self.validate_schemas()
+
def equals(self, other):
if not isinstance(other, ParquetDataset):
raise TypeError('`other` must be an instance of ParquetDataset')
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 4598bb9..9398ea8 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2760,3 +2760,24 @@ def test_multi_dataset_metadata(tempdir):
assert _md['num_row_groups'] == 2
assert _md['serialized_size'] == 0
assert md['serialized_size'] > 0
+
+
+def test_filter_before_validate_schema(tempdir):
+ # ARROW-4076 apply filter before schema validation
+ # to avoid checking unneeded schemas
+
+ # create partitioned dataset with mismatching schemas which would
+ # otherwise raise if first validating all schemas
+ dir1 = tempdir / 'A=0'
+ dir1.mkdir()
+ table1 = pa.Table.from_pandas(pd.DataFrame({'B': [1, 2, 3]}))
+ pq.write_table(table1, dir1 / 'data.parquet')
+
+ dir2 = tempdir / 'A=1'
+ dir2.mkdir()
+ table2 = pa.Table.from_pandas(pd.DataFrame({'B': ['a', 'b', 'c']}))
+ pq.write_table(table2, dir2 / 'data.parquet')
+
+ # read single file using filter
+ table = pq.read_table(tempdir, filters=[[('A', '==', 0)]])
+ assert table.column('B').equals(pa.column('B', pa.array([1, 2, 3])))