You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/12/07 16:34:38 UTC
[arrow] branch master updated: ARROW-10146: [Python] Fix parquet FileMetadata.to_dict in case statistics is not set
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 49fb0f5 ARROW-10146: [Python] Fix parquet FileMetadata.to_dict in case statistics is not set
49fb0f5 is described below
commit 49fb0f5218790fa3eec899eafd84054971a4784a
Author: Joris Van den Bossche <jo...@gmail.com>
AuthorDate: Mon Dec 7 17:33:20 2020 +0100
ARROW-10146: [Python] Fix parquet FileMetadata.to_dict in case statistics is not set
Closes #8861 from jorisvandenbossche/ARROW-10146
Authored-by: Joris Van den Bossche <jo...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
python/pyarrow/_parquet.pyx | 3 ++-
python/pyarrow/tests/test_parquet.py | 12 ++++++++++++
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index be8e363..029df3e 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -318,6 +318,7 @@ cdef class ColumnChunkMetaData(_Weakrefable):
self.total_uncompressed_size)
def to_dict(self):
+ statistics = self.statistics.to_dict() if self.is_stats_set else None
d = dict(
file_offset=self.file_offset,
file_path=self.file_path,
@@ -325,7 +326,7 @@ cdef class ColumnChunkMetaData(_Weakrefable):
num_values=self.num_values,
path_in_schema=self.path_in_schema,
is_stats_set=self.is_stats_set,
- statistics=self.statistics.to_dict(),
+ statistics=statistics,
compression=self.compression,
encodings=self.encodings,
has_dictionary_page=self.has_dictionary_page,
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index baacb74..9422594 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -1136,6 +1136,18 @@ def test_statistics_convert_logical_types(tempdir):
assert stats.max == max_val
+def test_parquet_metadata_empty_to_dict(tempdir):
+ # https://issues.apache.org/jira/browse/ARROW-10146
+ table = pa.table({"a": pa.array([], type="int64")})
+ pq.write_table(table, tempdir / "data.parquet")
+ metadata = pq.read_metadata(tempdir / "data.parquet")
+ # ensure this doesn't error / statistics set to None
+ metadata_dict = metadata.to_dict()
+ assert len(metadata_dict["row_groups"]) == 1
+ assert len(metadata_dict["row_groups"][0]["columns"]) == 1
+ assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
+
+
def test_parquet_write_disable_statistics(tempdir):
table = pa.Table.from_pydict(
OrderedDict([