Posted to commits@arrow.apache.org by we...@apache.org on 2023/06/22 21:19:09 UTC

[arrow] branch main updated: GH-35859: [Python] Actually change the default row group size to 1Mi (#36012)

This is an automated email from the ASF dual-hosted git repository.

westonpace pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 0d1f723412 GH-35859: [Python] Actually change the default row group size to 1Mi (#36012)
0d1f723412 is described below

commit 0d1f7234124e67b98b3aa0aa3e521d42c2f3165d
Author: Weston Pace <we...@gmail.com>
AuthorDate: Thu Jun 22 14:19:01 2023 -0700

    GH-35859: [Python] Actually change the default row group size to 1Mi (#36012)
    
    ### Rationale for this change
    
    In #34280 the default row group size was changed to 1Mi.  However, this was accidentally reverted (for Python, but not C++) in #34435.
    
    The problem is that there is both an "absolute max row group size for the writer" and a "row group size to use for this table".  The pyarrow user is unable to set the former property.
    
    The behavior in pyarrow was previously: "If no value is given in the call to write_table, then don't specify anything and let the absolute max apply."
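    
    For concreteness, a rough sketch of that previous resolution (the helper name is illustrative, not part of the pyarrow API; the constant mirrors the writer's absolute max):
    
    ```python
    _MAX_ROW_GROUP_SIZE = 64 * 1024 * 1024  # the writer's absolute max, not settable from pyarrow
    
    def previous_row_group_size(num_table_rows, row_group_size=None):
        # Previously, omitting row_group_size meant "one row group per table",
        # limited only by the writer's absolute max (the latch).
        if row_group_size is None:
            row_group_size = num_table_rows
        return min(row_group_size, _MAX_ROW_GROUP_SIZE)
    ```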
    
    The first fix (#34280) changed the absolute max to 1Mi, which made it impossible for the user to use a larger row group size.  The second fix (#34435) changed the absolute max back to 64Mi, which meant the default didn't change.
    
    ### What changes are included in this PR?
    
    This change leaves the absolute max at 64Mi.  However, if the user does not specify a row group size, we no longer "just use the table size"; instead we use 1Mi (or the table size, if it is smaller).
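    
    As a minimal end-to-end sketch of the new behavior (the file name here is illustrative):
    
    ```python
    import pyarrow as pa
    import pyarrow.parquet as pq
    
    # A table slightly larger than two default row groups: 2 * 1Mi + 10 rows.
    num_rows = 2 * 1024 * 1024 + 10
    table = pa.table({"x": pa.array(range(num_rows), type=pa.int64())})
    
    # No row_group_size is passed, so the writer now defaults to 1Mi rows per group.
    pq.write_table(table, "example.parquet")
    
    metadata = pq.read_metadata("example.parquet")
    assert metadata.num_row_groups == 3                   # 1Mi + 1Mi + 10
    assert metadata.row_group(0).num_rows == 1024 * 1024
    assert metadata.row_group(2).num_rows == 10
    ```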
    
    ### Are these changes tested?
    
    Yes, a unit test was added.
    
    ### Are there any user-facing changes?
    
    Yes, the default row group size now truly changes to 1Mi.  This change was already announced as part of #34280.
    * Closes: #35859
    
    Authored-by: Weston Pace <we...@gmail.com>
    Signed-off-by: Weston Pace <we...@gmail.com>
---
 python/pyarrow/_parquet.pyx                         |  6 ++++--
 python/pyarrow/tests/parquet/test_parquet_writer.py | 15 ++++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index f9cd5289c7..781c7e23be 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -42,6 +42,8 @@ from pyarrow.lib import (ArrowException, NativeFile, BufferOutputStream,
 
 cimport cpython as cp
 
+_DEFAULT_ROW_GROUP_SIZE = 1024*1024
+_MAX_ROW_GROUP_SIZE = 64*1024*1024
 
 cdef class Statistics(_Weakrefable):
     """Statistics for a single column in a single row group."""
@@ -1595,7 +1597,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
     # The user can always specify a smaller row group size (and the default
     # is smaller) when calling write_table.  If the call to write_table uses
     # a size larger than this then it will be latched to this value.
-    props.max_row_group_length(64*1024*1024)
+    props.max_row_group_length(_MAX_ROW_GROUP_SIZE)
 
     properties = props.build()
 
@@ -1767,7 +1769,7 @@ cdef class ParquetWriter(_Weakrefable):
             int64_t c_row_group_size
 
         if row_group_size is None or row_group_size == -1:
-            c_row_group_size = ctable.num_rows()
+            c_row_group_size = min(ctable.num_rows(), _DEFAULT_ROW_GROUP_SIZE)
         elif row_group_size == 0:
             raise ValueError('Row group size cannot be 0')
         else:
diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py
index 9ac1bbe8d6..5e6895c8dc 100644
--- a/python/pyarrow/tests/parquet/test_parquet_writer.py
+++ b/python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -223,15 +223,19 @@ def test_parquet_writer_chunk_size(tempdir):
         table = pa.Table.from_arrays([
             _range_integers(data_size, 'b')
         ], names=['x'])
-        pq.write_table(table, tempdir / 'test.parquet', row_group_size=chunk_size)
+        if chunk_size is None:
+            pq.write_table(table, tempdir / 'test.parquet')
+        else:
+            pq.write_table(table, tempdir / 'test.parquet', row_group_size=chunk_size)
         metadata = pq.read_metadata(tempdir / 'test.parquet')
+        expected_chunk_size = default_chunk_size if chunk_size is None else chunk_size
         assert metadata.num_row_groups == expect_num_chunks
-        latched_chunk_size = min(chunk_size, abs_max_chunk_size)
+        latched_chunk_size = min(expected_chunk_size, abs_max_chunk_size)
         # First chunks should be full size
         for chunk_idx in range(expect_num_chunks - 1):
             assert metadata.row_group(chunk_idx).num_rows == latched_chunk_size
         # Last chunk may be smaller
-        remainder = data_size - (chunk_size * (expect_num_chunks - 1))
+        remainder = data_size - (expected_chunk_size * (expect_num_chunks - 1))
         if remainder == 0:
             assert metadata.row_group(
                 expect_num_chunks - 1).num_rows == latched_chunk_size
@@ -246,6 +250,11 @@ def test_parquet_writer_chunk_size(tempdir):
     # by the absolute max chunk size
     check_chunk_size(abs_max_chunk_size * 2, abs_max_chunk_size * 2, 2)
 
+    # These tests don't pass a chunk_size to write_table and so the chunk size
+    # should be default_chunk_size
+    check_chunk_size(default_chunk_size, None, 1)
+    check_chunk_size(default_chunk_size + 1, None, 2)
+
 
 @pytest.mark.pandas
 @pytest.mark.parametrize("filesystem", [