You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2020/12/01 10:31:08 UTC

[arrow] branch master updated: ARROW-10778: [Python] Fix RowGroupInfo.statistics for empty row groups

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5eae11f  ARROW-10778: [Python] Fix RowGroupInfo.statistics for empty row groups
5eae11f is described below

commit 5eae11fc6fdd2eea16fa3048b8f954c76558ea1b
Author: rjzamora <rz...@gmail.com>
AuthorDate: Tue Dec 1 11:29:51 2020 +0100

    ARROW-10778: [Python] Fix RowGroupInfo.statistics for empty row groups
    
    Simple fix for `RowGroupInfo.statistics`: a column chunk's `statistics` can be `None` (for example, in an empty row group), so check for `None` before accessing `statistics.has_min_max`.
    
    Closes #8809 from rjzamora/fix-empty-stats
    
    Authored-by: rjzamora <rz...@gmail.com>
    Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
 python/pyarrow/_dataset.pyx          |  7 ++++---
 python/pyarrow/tests/test_dataset.py | 12 ++++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 393071a..bbe0485 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -929,7 +929,8 @@ class RowGroupInfo:
         def name_stats(i):
             col = self.metadata.column(i)
 
-            if not col.statistics.has_min_max:
+            stats = col.statistics
+            if stats is None or not stats.has_min_max:
                 return None, None
 
             name = col.path_in_schema
@@ -939,8 +940,8 @@ class RowGroupInfo:
 
             typ = self.schema.field(field_index).type
             return col.path_in_schema, {
-                'min': pa.scalar(col.statistics.min, type=typ).as_py(),
-                'max': pa.scalar(col.statistics.max, type=typ).as_py()
+                'min': pa.scalar(stats.min, type=typ).as_py(),
+                'max': pa.scalar(stats.max, type=typ).as_py()
             }
 
         return {
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 92a77d9..1abd1c9 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -1022,6 +1022,18 @@ def test_parquet_fragment_statistics_nulls(tempdir):
 
 @pytest.mark.pandas
 @pytest.mark.parquet
+def test_parquet_empty_row_group_statistics(tempdir):
+    df = pd.DataFrame({"a": ["a", "b", "b"], "b": [4, 5, 6]})[:0]
+    df.to_parquet(tempdir / "test.parquet", engine="pyarrow")
+
+    dataset = ds.dataset(tempdir / "test.parquet", format="parquet")
+    fragments = list(dataset.get_fragments())[0].split_by_row_group()
+    # Only row group is empty
+    assert fragments[0].row_groups[0].statistics == {}
+
+
+@pytest.mark.pandas
+@pytest.mark.parquet
 def test_fragments_parquet_row_groups_predicate(tempdir):
     table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2)