You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/05/31 10:49:08 UTC

[arrow-rs] branch master updated: Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties (#4313)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 768e726df Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties (#4313)
768e726df is described below

commit 768e726df3077403f4653dffdc961328a22a5ef4
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed May 31 06:49:02 2023 -0400

    Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties (#4313)
    
    * Use `page_size` consistently, deprecate `pagesize`
    
    * doc tweaks
    
    * Update parquet/src/file/properties.rs
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    ---------
    
    Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
 parquet/src/arrow/arrow_reader/mod.rs |  4 +-
 parquet/src/arrow/arrow_writer/mod.rs | 10 ++--
 parquet/src/bin/parquet-rewrite.rs    | 12 ++---
 parquet/src/column/writer/mod.rs      | 11 ++--
 parquet/src/file/properties.rs        | 99 ++++++++++++++++++++++++++---------
 parquet/tests/arrow_writer_layout.rs  | 20 +++----
 6 files changed, 104 insertions(+), 52 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 819e96c0a..f3e178bdf 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1240,7 +1240,7 @@ mod tests {
 
         fn writer_props(&self) -> WriterProperties {
             let builder = WriterProperties::builder()
-                .set_data_pagesize_limit(self.max_data_page_size)
+                .set_data_page_size_limit(self.max_data_page_size)
                 .set_write_batch_size(self.write_batch_size)
                 .set_writer_version(self.writer_version)
                 .set_statistics_enabled(self.enabled_statistics);
@@ -1248,7 +1248,7 @@ mod tests {
             let builder = match self.encoding {
                 Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => builder
                     .set_dictionary_enabled(true)
-                    .set_dictionary_pagesize_limit(self.max_dict_page_size),
+                    .set_dictionary_page_size_limit(self.max_dict_page_size),
                 _ => builder
                     .set_dictionary_enabled(false)
                     .set_encoding(self.encoding),
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 5f2750a55..0aca77f5b 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1320,8 +1320,8 @@ mod tests {
 
         // Set everything very low so we fallback to PLAIN encoding after the first row
         let props = WriterProperties::builder()
-            .set_data_pagesize_limit(1)
-            .set_dictionary_pagesize_limit(1)
+            .set_data_page_size_limit(1)
+            .set_dictionary_page_size_limit(1)
             .set_write_batch_size(1)
             .build();
 
@@ -1494,7 +1494,7 @@ mod tests {
                             .set_writer_version(version)
                             .set_max_row_group_size(row_group_size)
                             .set_dictionary_enabled(dictionary_size != 0)
-                            .set_dictionary_pagesize_limit(dictionary_size.max(1))
+                            .set_dictionary_page_size_limit(dictionary_size.max(1))
                             .set_encoding(*encoding)
                             .set_bloom_filter_enabled(bloom_filter)
                             .build();
@@ -2043,7 +2043,7 @@ mod tests {
         let expected_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
 
         let row_group_sizes = [1024, SMALL_SIZE, SMALL_SIZE / 2, SMALL_SIZE / 2 + 1, 10];
-        let data_pagesize_limit: usize = 32;
+        let data_page_size_limit: usize = 32;
         let write_batch_size: usize = 16;
 
         for encoding in &encodings {
@@ -2053,7 +2053,7 @@ mod tests {
                     .set_max_row_group_size(row_group_size)
                     .set_dictionary_enabled(false)
                     .set_encoding(*encoding)
-                    .set_data_pagesize_limit(data_pagesize_limit)
+                    .set_data_page_size_limit(data_page_size_limit)
                     .set_write_batch_size(write_batch_size)
                     .build();
 
diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs
index 57e8885c3..e4a80e7af 100644
--- a/parquet/src/bin/parquet-rewrite.rs
+++ b/parquet/src/bin/parquet-rewrite.rs
@@ -164,7 +164,7 @@ struct Args {
 
     /// Sets best effort maximum size of a data page in bytes.
     #[clap(long)]
-    data_pagesize_limit: Option<usize>,
+    data_page_size_limit: Option<usize>,
 
     /// Sets max statistics size for any column.
     ///
@@ -174,7 +174,7 @@ struct Args {
 
     /// Sets best effort maximum dictionary page size, in bytes.
     #[clap(long)]
-    dictionary_pagesize_limit: Option<usize>,
+    dictionary_page_size_limit: Option<usize>,
 
     /// Sets whether bloom filter is enabled for any column.
     #[clap(long)]
@@ -237,13 +237,13 @@ fn main() {
         writer_properties_builder =
             writer_properties_builder.set_data_page_row_count_limit(value);
     }
-    if let Some(value) = args.data_pagesize_limit {
+    if let Some(value) = args.data_page_size_limit {
         writer_properties_builder =
-            writer_properties_builder.set_data_pagesize_limit(value);
+            writer_properties_builder.set_data_page_size_limit(value);
     }
-    if let Some(value) = args.dictionary_pagesize_limit {
+    if let Some(value) = args.dictionary_page_size_limit {
         writer_properties_builder =
-            writer_properties_builder.set_dictionary_pagesize_limit(value);
+            writer_properties_builder.set_dictionary_page_size_limit(value);
     }
     if let Some(value) = args.max_statistics_size {
         writer_properties_builder =
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 5e623d281..310519f4a 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -609,7 +609,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
     #[inline]
     fn should_dict_fallback(&self) -> bool {
         match self.encoder.estimated_dict_page_size() {
-            Some(size) => size >= self.props.dictionary_pagesize_limit(),
+            Some(size) => size >= self.props.dictionary_page_size_limit(),
             None => false,
         }
     }
@@ -627,7 +627,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
 
         self.page_metrics.num_buffered_rows as usize
             >= self.props.data_page_row_count_limit()
-            || self.encoder.estimated_data_page_size() >= self.props.data_pagesize_limit()
+            || self.encoder.estimated_data_page_size()
+                >= self.props.data_page_size_limit()
     }
 
     /// Performs dictionary fallback.
@@ -1839,8 +1840,8 @@ mod tests {
     #[test]
     fn test_column_writer_dictionary_fallback_small_data_page() {
         let props = WriterProperties::builder()
-            .set_dictionary_pagesize_limit(32)
-            .set_data_pagesize_limit(32)
+            .set_dictionary_page_size_limit(32)
+            .set_data_page_size_limit(32)
             .build();
         column_roundtrip_random::<Int32Type>(props, 1024, i32::MIN, i32::MAX, 10, 10);
     }
@@ -1899,7 +1900,7 @@ mod tests {
         let page_writer = Box::new(SerializedPageWriter::new(&mut write));
         let props = Arc::new(
             WriterProperties::builder()
-                .set_data_pagesize_limit(10)
+                .set_data_page_size_limit(10)
                 .set_write_batch_size(3) // write 3 values at a time
                 .build(),
         );
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index c09503987..66690463a 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -117,8 +117,8 @@ pub type WriterPropertiesPtr = Arc<WriterProperties>;
 /// use [`WriterPropertiesBuilder`] to assemble these properties.
 #[derive(Debug, Clone)]
 pub struct WriterProperties {
-    data_pagesize_limit: usize,
-    dictionary_pagesize_limit: usize,
+    data_page_size_limit: usize,
+    dictionary_page_size_limit: usize,
     data_page_row_count_limit: usize,
     write_batch_size: usize,
     max_row_group_size: usize,
@@ -152,23 +152,42 @@ impl WriterProperties {
     /// Returns data page size limit.
     ///
     /// Note: this is a best effort limit based on the write batch size
+    #[deprecated(since = "41.0.0", note = "Use data_page_size_limit")]
     pub fn data_pagesize_limit(&self) -> usize {
-        self.data_pagesize_limit
+        self.data_page_size_limit
+    }
+
+    /// Returns data page size limit.
+    ///
+    /// Note: this is a best effort limit based on the write batch size
+    ///
+    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
+    pub fn data_page_size_limit(&self) -> usize {
+        self.data_page_size_limit
     }
 
     /// Returns dictionary page size limit.
     ///
     /// Note: this is a best effort limit based on the write batch size
+    #[deprecated(since = "41.0.0", note = "Use dictionary_page_size_limit")]
     pub fn dictionary_pagesize_limit(&self) -> usize {
-        self.dictionary_pagesize_limit
+        self.dictionary_page_size_limit
     }
 
-    /// Returns the maximum page row count
+    /// Returns dictionary page size limit.
     ///
-    /// This can be used to limit the number of rows within a page to
-    /// yield better page pruning
+    /// Note: this is a best effort limit based on the write batch size
+    ///
+    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
+    pub fn dictionary_page_size_limit(&self) -> usize {
+        self.dictionary_page_size_limit
+    }
+
+    /// Returns the maximum page row count
     ///
     /// Note: this is a best effort limit based on the write batch size
+    ///
+    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
     pub fn data_page_row_count_limit(&self) -> usize {
         self.data_page_row_count_limit
     }
@@ -290,8 +309,8 @@ impl WriterProperties {
 
 /// Writer properties builder.
 pub struct WriterPropertiesBuilder {
-    data_pagesize_limit: usize,
-    dictionary_pagesize_limit: usize,
+    data_page_size_limit: usize,
+    dictionary_page_size_limit: usize,
     data_page_row_count_limit: usize,
     write_batch_size: usize,
     max_row_group_size: usize,
@@ -307,8 +326,8 @@ impl WriterPropertiesBuilder {
     /// Returns default state of the builder.
     fn with_defaults() -> Self {
         Self {
-            data_pagesize_limit: DEFAULT_PAGE_SIZE,
-            dictionary_pagesize_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
+            data_page_size_limit: DEFAULT_PAGE_SIZE,
+            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
             data_page_row_count_limit: usize::MAX,
             write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
             max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
@@ -324,8 +343,8 @@ impl WriterPropertiesBuilder {
     /// Finalizes the configuration and returns immutable writer properties struct.
     pub fn build(self) -> WriterProperties {
         WriterProperties {
-            data_pagesize_limit: self.data_pagesize_limit,
-            dictionary_pagesize_limit: self.dictionary_pagesize_limit,
+            data_page_size_limit: self.data_page_size_limit,
+            dictionary_page_size_limit: self.dictionary_page_size_limit,
             data_page_row_count_limit: self.data_page_row_count_limit,
             write_batch_size: self.write_batch_size,
             max_row_group_size: self.max_row_group_size,
@@ -351,16 +370,32 @@ impl WriterPropertiesBuilder {
     ///
     /// Note: this is a best effort limit based on value of
     /// [`set_write_batch_size`](Self::set_write_batch_size).
+    #[deprecated(since = "41.0.0", note = "Use set_data_page_size_limit")]
     pub fn set_data_pagesize_limit(mut self, value: usize) -> Self {
-        self.data_pagesize_limit = value;
+        self.data_page_size_limit = value;
         self
     }
 
-    /// Sets best effort maximum number of rows in a data page.
+    /// Sets best effort maximum size of a data page in bytes.
     ///
+    /// The parquet writer will attempt to limit the sizes of each
+    /// `DataPage` to this many bytes. Reducing this value will result
+    /// in larger parquet files, but may improve the effectiveness of
+    /// page index based predicate pushdown during reading.
     ///
-    /// This can be used to limit the number of rows within a page to
-    /// yield better page pruning.
+    /// Note: this is a best effort limit based on value of
+    /// [`set_write_batch_size`](Self::set_write_batch_size).
+    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
+        self.data_page_size_limit = value;
+        self
+    }
+
+    /// Sets best effort maximum number of rows in a data page.
+    ///
+    /// The parquet writer will attempt to limit the number of rows in
+    /// each `DataPage` to this value. Reducing this value will result
+    /// in larger parquet files, but may improve the effectiveness of
+    /// page index based predicate pushdown during reading.
     ///
     /// Note: this is a best effort limit based on value of
     /// [`set_write_batch_size`](Self::set_write_batch_size).
@@ -373,8 +408,24 @@ impl WriterPropertiesBuilder {
     ///
     /// Note: this is a best effort limit based on value of
     /// [`set_write_batch_size`](Self::set_write_batch_size).
+    #[deprecated(since = "41.0.0", note = "Use set_dictionary_page_size_limit")]
     pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self {
-        self.dictionary_pagesize_limit = value;
+        self.dictionary_page_size_limit = value;
+        self
+    }
+
+    /// Sets best effort maximum dictionary page size, in bytes.
+    ///
+    /// The parquet writer will attempt to limit the size of each
+    /// `DataPage` used to store dictionaries to this many
+    /// bytes. Reducing this value will result in larger parquet
+    /// files, but may improve the effectiveness of page index based
+    /// predicate pushdown during reading.
+    ///
+    /// Note: this is a best effort limit based on value of
+    /// [`set_write_batch_size`](Self::set_write_batch_size).
+    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
+        self.dictionary_page_size_limit = value;
         self
     }
 
@@ -850,9 +901,9 @@ mod tests {
     #[test]
     fn test_writer_properties_default_settings() {
         let props = WriterProperties::default();
-        assert_eq!(props.data_pagesize_limit(), DEFAULT_PAGE_SIZE);
+        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
         assert_eq!(
-            props.dictionary_pagesize_limit(),
+            props.dictionary_page_size_limit(),
             DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
         );
         assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
@@ -939,8 +990,8 @@ mod tests {
         let props = WriterProperties::builder()
             // file settings
             .set_writer_version(WriterVersion::PARQUET_2_0)
-            .set_data_pagesize_limit(10)
-            .set_dictionary_pagesize_limit(20)
+            .set_data_page_size_limit(10)
+            .set_dictionary_page_size_limit(20)
             .set_write_batch_size(30)
             .set_max_row_group_size(40)
             .set_created_by("default".to_owned())
@@ -969,8 +1020,8 @@ mod tests {
             .build();
 
         assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
-        assert_eq!(props.data_pagesize_limit(), 10);
-        assert_eq!(props.dictionary_pagesize_limit(), 20);
+        assert_eq!(props.data_page_size_limit(), 10);
+        assert_eq!(props.dictionary_page_size_limit(), 20);
         assert_eq!(props.write_batch_size(), 30);
         assert_eq!(props.max_row_group_size(), 40);
         assert_eq!(props.created_by(), "default");
diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs
index 4bf649f24..142112b7b 100644
--- a/parquet/tests/arrow_writer_layout.rs
+++ b/parquet/tests/arrow_writer_layout.rs
@@ -175,7 +175,7 @@ fn test_primitive() {
     let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
     let props = WriterProperties::builder()
         .set_dictionary_enabled(false)
-        .set_data_pagesize_limit(1000)
+        .set_data_page_size_limit(1000)
         .set_write_batch_size(10)
         .build();
 
@@ -204,8 +204,8 @@ fn test_primitive() {
     // Test spill dictionary
     let props = WriterProperties::builder()
         .set_dictionary_enabled(true)
-        .set_dictionary_pagesize_limit(1000)
-        .set_data_pagesize_limit(10000)
+        .set_dictionary_page_size_limit(1000)
+        .set_data_page_size_limit(10000)
         .set_write_batch_size(10)
         .build();
 
@@ -246,8 +246,8 @@ fn test_primitive() {
     // Test spill dictionary encoded pages
     let props = WriterProperties::builder()
         .set_dictionary_enabled(true)
-        .set_dictionary_pagesize_limit(10000)
-        .set_data_pagesize_limit(500)
+        .set_dictionary_page_size_limit(10000)
+        .set_data_page_size_limit(500)
         .set_write_batch_size(10)
         .build();
 
@@ -350,7 +350,7 @@ fn test_string() {
     let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
     let props = WriterProperties::builder()
         .set_dictionary_enabled(false)
-        .set_data_pagesize_limit(1000)
+        .set_data_page_size_limit(1000)
         .set_write_batch_size(10)
         .build();
 
@@ -386,8 +386,8 @@ fn test_string() {
     // Test spill dictionary
     let props = WriterProperties::builder()
         .set_dictionary_enabled(true)
-        .set_dictionary_pagesize_limit(1000)
-        .set_data_pagesize_limit(10000)
+        .set_dictionary_page_size_limit(1000)
+        .set_data_page_size_limit(10000)
         .set_write_batch_size(10)
         .build();
 
@@ -435,8 +435,8 @@ fn test_string() {
     // Test spill dictionary encoded pages
     let props = WriterProperties::builder()
         .set_dictionary_enabled(true)
-        .set_dictionary_pagesize_limit(20000)
-        .set_data_pagesize_limit(500)
+        .set_dictionary_page_size_limit(20000)
+        .set_data_page_size_limit(500)
         .set_write_batch_size(10)
         .build();