You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2020/12/01 10:31:08 UTC
[arrow] branch master updated: ARROW-10778: [Python] Fix RowGroupInfo.statistics for empty row groups
This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5eae11f ARROW-10778: [Python] Fix RowGroupInfo.statistics for empty row groups
5eae11f is described below
commit 5eae11fc6fdd2eea16fa3048b8f954c76558ea1b
Author: rjzamora <rz...@gmail.com>
AuthorDate: Tue Dec 1 11:29:51 2020 +0100
ARROW-10778: [Python] Fix RowGroupInfo.statistics for empty row groups
Simple fix to avoid accessing `statistics.has_min_max` when a column's `statistics` is `None` (as happens for an empty row group) in `RowGroupInfo.statistics`.
Closes #8809 from rjzamora/fix-empty-stats
Authored-by: rjzamora <rz...@gmail.com>
Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
python/pyarrow/_dataset.pyx | 7 ++++---
python/pyarrow/tests/test_dataset.py | 12 ++++++++++++
2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 393071a..bbe0485 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -929,7 +929,8 @@ class RowGroupInfo:
def name_stats(i):
col = self.metadata.column(i)
- if not col.statistics.has_min_max:
+ stats = col.statistics
+ if stats is None or not stats.has_min_max:
return None, None
name = col.path_in_schema
@@ -939,8 +940,8 @@ class RowGroupInfo:
typ = self.schema.field(field_index).type
return col.path_in_schema, {
- 'min': pa.scalar(col.statistics.min, type=typ).as_py(),
- 'max': pa.scalar(col.statistics.max, type=typ).as_py()
+ 'min': pa.scalar(stats.min, type=typ).as_py(),
+ 'max': pa.scalar(stats.max, type=typ).as_py()
}
return {
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 92a77d9..1abd1c9 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1022,6 +1022,18 @@ def test_parquet_fragment_statistics_nulls(tempdir):
@pytest.mark.pandas
@pytest.mark.parquet
+def test_parquet_empty_row_group_statistics(tempdir):
+ df = pd.DataFrame({"a": ["a", "b", "b"], "b": [4, 5, 6]})[:0]
+ df.to_parquet(tempdir / "test.parquet", engine="pyarrow")
+
+ dataset = ds.dataset(tempdir / "test.parquet", format="parquet")
+ fragments = list(dataset.get_fragments())[0].split_by_row_group()
+ # Only row group is empty
+ assert fragments[0].row_groups[0].statistics == {}
+
+
+@pytest.mark.pandas
+@pytest.mark.parquet
def test_fragments_parquet_row_groups_predicate(tempdir):
table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2)