You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/12/07 16:34:38 UTC

[arrow] branch master updated: ARROW-10146: [Python] Fix parquet FileMetadata.to_dict in case statistics is not set

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 49fb0f5  ARROW-10146: [Python] Fix parquet FileMetadata.to_dict in case statistics is not set
49fb0f5 is described below

commit 49fb0f5218790fa3eec899eafd84054971a4784a
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Mon Dec 7 17:33:20 2020 +0100

    ARROW-10146: [Python] Fix parquet FileMetadata.to_dict in case statistics is not set
    
    Closes #8861 from jorisvandenbossche/ARROW-10146
    
    Authored-by: Joris Van den Bossche <jo...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 python/pyarrow/_parquet.pyx          |  3 ++-
 python/pyarrow/tests/test_parquet.py | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index be8e363..029df3e 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -318,6 +318,7 @@ cdef class ColumnChunkMetaData(_Weakrefable):
                                           self.total_uncompressed_size)
 
     def to_dict(self):
+        statistics = self.statistics.to_dict() if self.is_stats_set else None
         d = dict(
             file_offset=self.file_offset,
             file_path=self.file_path,
@@ -325,7 +326,7 @@ cdef class ColumnChunkMetaData(_Weakrefable):
             num_values=self.num_values,
             path_in_schema=self.path_in_schema,
             is_stats_set=self.is_stats_set,
-            statistics=self.statistics.to_dict(),
+            statistics=statistics,
             compression=self.compression,
             encodings=self.encodings,
             has_dictionary_page=self.has_dictionary_page,
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index baacb74..9422594 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1136,6 +1136,18 @@ def test_statistics_convert_logical_types(tempdir):
         assert stats.max == max_val
 
 
+def test_parquet_metadata_empty_to_dict(tempdir):
+    # https://issues.apache.org/jira/browse/ARROW-10146
+    table = pa.table({"a": pa.array([], type="int64")})
+    pq.write_table(table, tempdir / "data.parquet")
+    metadata = pq.read_metadata(tempdir / "data.parquet")
+    # ensure this doesn't error / statistics set to None
+    metadata_dict = metadata.to_dict()
+    assert len(metadata_dict["row_groups"]) == 1
+    assert len(metadata_dict["row_groups"][0]["columns"]) == 1
+    assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
+
+
 def test_parquet_write_disable_statistics(tempdir):
     table = pa.Table.from_pydict(
         OrderedDict([