Posted to commits@arrow.apache.org by am...@apache.org on 2022/04/13 10:29:20 UTC

[arrow] branch master updated: ARROW-7174: [Python] Expose parquet dictionary_pagesize_limit write parameter

This is an automated email from the ASF dual-hosted git repository.

amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5441c4b10c ARROW-7174: [Python] Expose parquet dictionary_pagesize_limit write parameter
5441c4b10c is described below

commit 5441c4b10c0ce44315f18b2ff6c6970ad62258de
Author: Raúl Cumplido <ra...@gmail.com>
AuthorDate: Wed Apr 13 12:29:12 2022 +0200

    ARROW-7174: [Python] Expose parquet dictionary_pagesize_limit write parameter
    
    This PR exposes the parquet `dictionary_pagesize_limit` `WriterProperties` option in Python.
    
    Closes #12825 from raulcd/ARROW-7174
    
    Authored-by: Raúl Cumplido <ra...@gmail.com>
    Signed-off-by: Alessandro Molina <am...@turbogears.org>
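
For illustration, a minimal usage sketch of the newly exposed parameter via
`pyarrow.parquet.write_table` (the table contents and output path below are
placeholders, not part of the commit):

    import pyarrow as pa
    import pyarrow.parquet as pq

    # A small table with a repetitive string column, so dictionary encoding applies.
    table = pa.table({"letters": ["a", "b"] * 500})

    # Cap the dictionary page size per row group at 64 KiB
    # (per the docstring added in this commit, the default is 1 MB).
    pq.write_table(table, "example.parquet", dictionary_pagesize_limit=64 * 1024)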
---
 python/pyarrow/_parquet.pxd                |  4 +++-
 python/pyarrow/_parquet.pyx                | 13 ++++++++++---
 python/pyarrow/parquet.py                  |  7 +++++++
 python/pyarrow/tests/parquet/test_basic.py | 14 ++++++++++++++
 4 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index e4146b8ad8..d1fbeab72b 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -410,6 +410,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
             Builder* encoding(const c_string& path,
                               ParquetEncoding encoding)
             Builder* write_batch_size(int64_t batch_size)
+            Builder* dictionary_pagesize_limit(int64_t dictionary_pagesize_limit)
             shared_ptr[WriterProperties] build()
 
     cdef cppclass ArrowWriterProperties:
@@ -553,7 +554,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
     column_encoding=*,
     data_page_version=*,
     FileEncryptionProperties encryption_properties=*,
-    write_batch_size=*) except *
+    write_batch_size=*,
+    dictionary_pagesize_limit=*) except *
 
 
 cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 44ca4e1c8d..73ba5bc49d 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -1261,7 +1261,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
         column_encoding=None,
         data_page_version=None,
         FileEncryptionProperties encryption_properties=None,
-        write_batch_size=None) except *:
+        write_batch_size=None,
+        dictionary_pagesize_limit=None) except *:
     """General writer properties"""
     cdef:
         shared_ptr[WriterProperties] properties
@@ -1387,6 +1388,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
     if write_batch_size is not None:
         props.write_batch_size(write_batch_size)
 
+    if dictionary_pagesize_limit is not None:
+        props.dictionary_pagesize_limit(dictionary_pagesize_limit)
+
     # encryption
 
     if encryption_properties is not None:
@@ -1482,6 +1486,7 @@ cdef class ParquetWriter(_Weakrefable):
         int64_t data_page_size
         FileEncryptionProperties encryption_properties
         int64_t write_batch_size
+        int64_t dictionary_pagesize_limit
 
     def __cinit__(self, where, Schema schema, use_dictionary=None,
                   compression=None, version=None,
@@ -1498,7 +1503,8 @@ cdef class ParquetWriter(_Weakrefable):
                   data_page_version=None,
                   use_compliant_nested_type=False,
                   encryption_properties=None,
-                  write_batch_size=None):
+                  write_batch_size=None,
+                  dictionary_pagesize_limit=None):
         cdef:
             shared_ptr[WriterProperties] properties
             shared_ptr[ArrowWriterProperties] arrow_properties
@@ -1527,7 +1533,8 @@ cdef class ParquetWriter(_Weakrefable):
             column_encoding=column_encoding,
             data_page_version=data_page_version,
             encryption_properties=encryption_properties,
-            write_batch_size=write_batch_size
+            write_batch_size=write_batch_size,
+            dictionary_pagesize_limit=dictionary_pagesize_limit
         )
         arrow_properties = _create_arrow_writer_properties(
             use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index f68979c87c..33094dabe6 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -781,6 +781,9 @@ write_batch_size : int, default None
     1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages
     are exceeding the ``data_page_size`` due to large column values, lowering
     the batch size can help keep page sizes closer to the intended size.
+dictionary_pagesize_limit : int, default None
+    Specify the dictionary page size limit per row group. If None, use the
+    default 1MB.
 """
 
 _parquet_writer_example_doc = """\
@@ -871,6 +874,7 @@ Examples
                  use_compliant_nested_type=False,
                  encryption_properties=None,
                  write_batch_size=None,
+                 dictionary_pagesize_limit=None,
                  **options):
         if use_deprecated_int96_timestamps is None:
             # Use int96 timestamps for Spark
@@ -925,6 +929,7 @@ Examples
             use_compliant_nested_type=use_compliant_nested_type,
             encryption_properties=encryption_properties,
             write_batch_size=write_batch_size,
+            dictionary_pagesize_limit=dictionary_pagesize_limit,
             **options)
         self.is_open = True
 
@@ -2820,6 +2825,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
                 use_compliant_nested_type=False,
                 encryption_properties=None,
                 write_batch_size=None,
+                dictionary_pagesize_limit=None,
                 **kwargs):
     row_group_size = kwargs.pop('chunk_size', row_group_size)
     use_int96 = use_deprecated_int96_timestamps
@@ -2843,6 +2849,7 @@ def write_table(table, where, row_group_size=None, version='1.0',
                 use_compliant_nested_type=use_compliant_nested_type,
                 encryption_properties=encryption_properties,
                 write_batch_size=write_batch_size,
+                dictionary_pagesize_limit=dictionary_pagesize_limit,
                 **kwargs) as writer:
             writer.write_table(table, row_group_size=row_group_size)
     except Exception:
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
index 8c7afb8355..e82e3a36df 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -79,6 +79,20 @@ def test_set_write_batch_size(use_legacy_dataset):
     )
 
 
+@pytest.mark.pandas
+@parametrize_legacy_dataset
+def test_set_dictionary_pagesize_limit(use_legacy_dataset):
+    df = _test_dataframe(100)
+    table = pa.Table.from_pandas(df, preserve_index=False)
+
+    _check_roundtrip(table, dictionary_pagesize_limit=1,
+                     data_page_size=10, version='2.4')
+
+    with pytest.raises(TypeError):
+        _check_roundtrip(table, dictionary_pagesize_limit="a",
+                         data_page_size=10, version='2.4')
+
+
 @pytest.mark.pandas
 @parametrize_legacy_dataset
 def test_chunked_table_write(use_legacy_dataset):