You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/04/29 16:13:40 UTC
[arrow] branch master updated: ARROW-15796: [Python] Pickling ParquetFileFragment shouldn't fetch metadata

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new edf7334fc3 ARROW-15796: [Python] Pickling ParquetFileFragment shouldn't fetch metadata
edf7334fc3 is described below

commit edf7334fc38ec9bc2e019bf400403e7c61fb585e
Author: Salonijain27 <sa...@gmail.com>
AuthorDate: Fri Apr 29 18:13:31 2022 +0200

    ARROW-15796: [Python] Pickling ParquetFileFragment shouldn't fetch metadata
    
    Closes #12909 from Salonijain27/ARROW-15796_parquet_update
    
    Lead-authored-by: Salonijain27 <sa...@gmail.com>
    Co-authored-by: salonijain27 <sa...@Salonis-MacBook-Pro.local>
    Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
    Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
 python/pyarrow/_dataset_parquet.pyx  |  8 +++++++-
 python/pyarrow/tests/test_dataset.py | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index f3bccf57f6..9f097947c1 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -301,7 +301,13 @@ cdef class ParquetFileFragment(FileFragment):
 
     def __reduce__(self):
         buffer = self.buffer
-        row_groups = [row_group.id for row_group in self.row_groups]
+        # parquet_file_fragment.row_groups() is empty if the metadata
+        # information of the file is not yet populated
+        if not bool(self.parquet_file_fragment.row_groups()):
+            row_groups = None
+        else:
+            row_groups = [row_group.id for row_group in self.row_groups]
+
         return self.format.make_fragment, (
             self.path if buffer is None else buffer,
             self.filesystem,
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index c5cec13f91..b8e15c597f 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1197,6 +1197,25 @@ def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs):
         assert row_group.statistics is not None
 
 
+@pytest.mark.pandas
+@pytest.mark.parquet
+def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs):
+    # Test for https://issues.apache.org/jira/browse/ARROW-15796
+    fs, assert_opens = open_logging_fs
+    _, dataset = _create_dataset_for_fragments(tempdir, filesystem=fs)
+    fragment = list(dataset.get_fragments())[1]
+
+    # The second fragment has not loaded its metadata yet; a pickle round
+    # trip should not read it either (hence the empty assert_opens list)
+    with assert_opens([]):
+        pickled_fragment = pickle.loads(pickle.dumps(fragment))
+
+    # accessing the row group info afterwards is what reads the metadata
+    with assert_opens([pickled_fragment.path]):
+        row_groups = pickled_fragment.row_groups
+    assert row_groups == [0]
+
+
 def _create_dataset_all_types(tempdir, chunk_size=None):
     table = pa.table(
         [