You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/04/29 16:13:40 UTC
[arrow] branch master updated: ARROW-15796: [Python] Pickling ParquetFileFragment shouldn't fetch metadata
This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new edf7334fc3 ARROW-15796: [Python] Pickling ParquetFileFragment shouldn't fetch metadata
edf7334fc3 is described below
commit edf7334fc38ec9bc2e019bf400403e7c61fb585e
Author: Salonijain27 <sa...@gmail.com>
AuthorDate: Fri Apr 29 18:13:31 2022 +0200
ARROW-15796: [Python] Pickling ParquetFileFragment shouldn't fetch metadata
Closes #12909 from Salonijain27/ARROW-15796_parquet_update
Lead-authored-by: Salonijain27 <sa...@gmail.com>
Co-authored-by: salonijain27 <sa...@Salonis-MacBook-Pro.local>
Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
python/pyarrow/_dataset_parquet.pyx | 8 +++++++-
python/pyarrow/tests/test_dataset.py | 19 +++++++++++++++++++
2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index f3bccf57f6..9f097947c1 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -301,7 +301,13 @@ cdef class ParquetFileFragment(FileFragment):
def __reduce__(self):
buffer = self.buffer
- row_groups = [row_group.id for row_group in self.row_groups]
+ # parquet_file_fragment.row_groups() is empty if the metadata
+ # information of the file is not yet populated
+ if not bool(self.parquet_file_fragment.row_groups()):
+ row_groups = None
+ else:
+ row_groups = [row_group.id for row_group in self.row_groups]
+
return self.format.make_fragment, (
self.path if buffer is None else buffer,
self.filesystem,
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index c5cec13f91..b8e15c597f 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1197,6 +1197,25 @@ def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs):
assert row_group.statistics is not None
+@pytest.mark.pandas
+@pytest.mark.parquet
+def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs):
+ # https://issues.apache.org/jira/browse/ARROW-15796
+ fs, assert_opens = open_logging_fs
+ _, dataset = _create_dataset_for_fragments(tempdir, filesystem=fs)
+ fragment = list(dataset.get_fragments())[1]
+
+ # second fragment hasn't yet loaded the metadata,
+ # and pickling it also should not read the metadata
+ with assert_opens([]):
+ pickled_fragment = pickle.loads(pickle.dumps(fragment))
+
+ # then accessing the row group info reads the metadata
+ with assert_opens([pickled_fragment.path]):
+ row_groups = pickled_fragment.row_groups
+ assert row_groups == [0]
+
+
def _create_dataset_all_types(tempdir, chunk_size=None):
table = pa.table(
[