Posted to commits@arrow.apache.org by tu...@apache.org on 2023/05/31 10:49:08 UTC
[arrow-rs] branch master updated: Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties (#4313)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 768e726df Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties (#4313)
768e726df is described below
commit 768e726df3077403f4653dffdc961328a22a5ef4
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed May 31 06:49:02 2023 -0400
Use `page_size` consistently, deprecate `pagesize` in parquet WriterProperties (#4313)
* Use `page_size` consistently, deprecate `pagesize`
* doc tweaks
* Update parquet/src/file/properties.rs
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---------
Co-authored-by: Raphael Taylor-Davies <17...@users.noreply.github.com>
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
parquet/src/arrow/arrow_reader/mod.rs | 4 +-
parquet/src/arrow/arrow_writer/mod.rs | 10 ++--
parquet/src/bin/parquet-rewrite.rs | 12 ++---
parquet/src/column/writer/mod.rs | 11 ++--
parquet/src/file/properties.rs | 99 ++++++++++++++++++++++++++---------
parquet/tests/arrow_writer_layout.rs | 20 +++----
6 files changed, 104 insertions(+), 52 deletions(-)
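
For users upgrading, a minimal before/after sketch of the rename (the limit values here are illustrative, not recommendations):

    use parquet::file::properties::WriterProperties;

    // Before: the old setters still compile but now emit deprecation warnings.
    #[allow(deprecated)]
    let old_props = WriterProperties::builder()
        .set_data_pagesize_limit(1024 * 1024)
        .set_dictionary_pagesize_limit(1024 * 1024)
        .build();

    // After: identical behavior under the consistent `page_size` spelling.
    let new_props = WriterProperties::builder()
        .set_data_page_size_limit(1024 * 1024)
        .set_dictionary_page_size_limit(1024 * 1024)
        .build();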
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 819e96c0a..f3e178bdf 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1240,7 +1240,7 @@ mod tests {
fn writer_props(&self) -> WriterProperties {
let builder = WriterProperties::builder()
- .set_data_pagesize_limit(self.max_data_page_size)
+ .set_data_page_size_limit(self.max_data_page_size)
.set_write_batch_size(self.write_batch_size)
.set_writer_version(self.writer_version)
.set_statistics_enabled(self.enabled_statistics);
@@ -1248,7 +1248,7 @@ mod tests {
let builder = match self.encoding {
Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => builder
.set_dictionary_enabled(true)
- .set_dictionary_pagesize_limit(self.max_dict_page_size),
+ .set_dictionary_page_size_limit(self.max_dict_page_size),
_ => builder
.set_dictionary_enabled(false)
.set_encoding(self.encoding),
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 5f2750a55..0aca77f5b 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1320,8 +1320,8 @@ mod tests {
// Set everything very low so we fallback to PLAIN encoding after the first row
let props = WriterProperties::builder()
- .set_data_pagesize_limit(1)
- .set_dictionary_pagesize_limit(1)
+ .set_data_page_size_limit(1)
+ .set_dictionary_page_size_limit(1)
.set_write_batch_size(1)
.build();
@@ -1494,7 +1494,7 @@ mod tests {
.set_writer_version(version)
.set_max_row_group_size(row_group_size)
.set_dictionary_enabled(dictionary_size != 0)
- .set_dictionary_pagesize_limit(dictionary_size.max(1))
+ .set_dictionary_page_size_limit(dictionary_size.max(1))
.set_encoding(*encoding)
.set_bloom_filter_enabled(bloom_filter)
.build();
@@ -2043,7 +2043,7 @@ mod tests {
let expected_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
let row_group_sizes = [1024, SMALL_SIZE, SMALL_SIZE / 2, SMALL_SIZE / 2 + 1, 10];
- let data_pagesize_limit: usize = 32;
+ let data_page_size_limit: usize = 32;
let write_batch_size: usize = 16;
for encoding in &encodings {
@@ -2053,7 +2053,7 @@ mod tests {
.set_max_row_group_size(row_group_size)
.set_dictionary_enabled(false)
.set_encoding(*encoding)
- .set_data_pagesize_limit(data_pagesize_limit)
+ .set_data_page_size_limit(data_page_size_limit)
.set_write_batch_size(write_batch_size)
.build();
diff --git a/parquet/src/bin/parquet-rewrite.rs b/parquet/src/bin/parquet-rewrite.rs
index 57e8885c3..e4a80e7af 100644
--- a/parquet/src/bin/parquet-rewrite.rs
+++ b/parquet/src/bin/parquet-rewrite.rs
@@ -164,7 +164,7 @@ struct Args {
/// Sets best effort maximum size of a data page in bytes.
#[clap(long)]
- data_pagesize_limit: Option<usize>,
+ data_page_size_limit: Option<usize>,
/// Sets max statistics size for any column.
///
@@ -174,7 +174,7 @@ struct Args {
/// Sets best effort maximum dictionary page size, in bytes.
#[clap(long)]
- dictionary_pagesize_limit: Option<usize>,
+ dictionary_page_size_limit: Option<usize>,
/// Sets whether bloom filter is enabled for any column.
#[clap(long)]
@@ -237,13 +237,13 @@ fn main() {
writer_properties_builder =
writer_properties_builder.set_data_page_row_count_limit(value);
}
- if let Some(value) = args.data_pagesize_limit {
+ if let Some(value) = args.data_page_size_limit {
writer_properties_builder =
- writer_properties_builder.set_data_pagesize_limit(value);
+ writer_properties_builder.set_data_page_size_limit(value);
}
- if let Some(value) = args.dictionary_pagesize_limit {
+ if let Some(value) = args.dictionary_page_size_limit {
writer_properties_builder =
- writer_properties_builder.set_dictionary_pagesize_limit(value);
+ writer_properties_builder.set_dictionary_page_size_limit(value);
}
if let Some(value) = args.max_statistics_size {
writer_properties_builder =
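
The command line flags that clap derives from these renamed fields change in the same way. Assuming clap's default kebab-case naming for `#[clap(long)]` fields, and with input/output flags shown only as placeholders, a hypothetical invocation would now be:

    parquet-rewrite --input in.parquet --output out.parquet \
        --data-page-size-limit 65536 \
        --dictionary-page-size-limit 65536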
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 5e623d281..310519f4a 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -609,7 +609,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
#[inline]
fn should_dict_fallback(&self) -> bool {
match self.encoder.estimated_dict_page_size() {
- Some(size) => size >= self.props.dictionary_pagesize_limit(),
+ Some(size) => size >= self.props.dictionary_page_size_limit(),
None => false,
}
}
@@ -627,7 +627,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
self.page_metrics.num_buffered_rows as usize
>= self.props.data_page_row_count_limit()
- || self.encoder.estimated_data_page_size() >= self.props.data_pagesize_limit()
+ || self.encoder.estimated_data_page_size()
+ >= self.props.data_page_size_limit()
}
/// Performs dictionary fallback.
@@ -1839,8 +1840,8 @@ mod tests {
#[test]
fn test_column_writer_dictionary_fallback_small_data_page() {
let props = WriterProperties::builder()
- .set_dictionary_pagesize_limit(32)
- .set_data_pagesize_limit(32)
+ .set_dictionary_page_size_limit(32)
+ .set_data_page_size_limit(32)
.build();
column_roundtrip_random::<Int32Type>(props, 1024, i32::MIN, i32::MAX, 10, 10);
}
@@ -1899,7 +1900,7 @@ mod tests {
let page_writer = Box::new(SerializedPageWriter::new(&mut write));
let props = Arc::new(
WriterProperties::builder()
- .set_data_pagesize_limit(10)
+ .set_data_page_size_limit(10)
.set_write_batch_size(3) // write 3 values at a time
.build(),
);
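
Both checks above are best effort: they run only once per write batch, so a page or dictionary can overshoot its limit by up to one batch of values. A minimal sketch forcing the dictionary fallback path, mirroring the test above (sizes are illustrative):

    use parquet::file::properties::WriterProperties;

    // With a 32 byte dictionary limit, the estimated dictionary page size is
    // exceeded almost immediately, so the column writer falls back to PLAIN
    // encoding; the 32 byte data page limit then cuts small data pages.
    let props = WriterProperties::builder()
        .set_dictionary_page_size_limit(32)
        .set_data_page_size_limit(32)
        .set_write_batch_size(3) // limits are only re-checked every 3 values
        .build();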
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index c09503987..66690463a 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -117,8 +117,8 @@ pub type WriterPropertiesPtr = Arc<WriterProperties>;
/// use [`WriterPropertiesBuilder`] to assemble these properties.
#[derive(Debug, Clone)]
pub struct WriterProperties {
- data_pagesize_limit: usize,
- dictionary_pagesize_limit: usize,
+ data_page_size_limit: usize,
+ dictionary_page_size_limit: usize,
data_page_row_count_limit: usize,
write_batch_size: usize,
max_row_group_size: usize,
@@ -152,23 +152,42 @@ impl WriterProperties {
/// Returns data page size limit.
///
/// Note: this is a best effort limit based on the write batch size
+ #[deprecated(since = "41.0.0", note = "Use data_page_size_limit")]
pub fn data_pagesize_limit(&self) -> usize {
- self.data_pagesize_limit
+ self.data_page_size_limit
+ }
+
+ /// Returns data page size limit.
+ ///
+ /// Note: this is a best effort limit based on the write batch size
+ ///
+ /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
+ pub fn data_page_size_limit(&self) -> usize {
+ self.data_page_size_limit
}
/// Returns dictionary page size limit.
///
/// Note: this is a best effort limit based on the write batch size
+ #[deprecated(since = "41.0.0", note = "Use dictionary_page_size_limit")]
pub fn dictionary_pagesize_limit(&self) -> usize {
- self.dictionary_pagesize_limit
+ self.dictionary_page_size_limit
}
- /// Returns the maximum page row count
+ /// Returns dictionary page size limit.
///
- /// This can be used to limit the number of rows within a page to
- /// yield better page pruning
+ /// Note: this is a best effort limit based on the write batch size
+ ///
+ /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
+ pub fn dictionary_page_size_limit(&self) -> usize {
+ self.dictionary_page_size_limit
+ }
+
+ /// Returns the maximum page row count
///
/// Note: this is a best effort limit based on the write batch size
+ ///
+ /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
pub fn data_page_row_count_limit(&self) -> usize {
self.data_page_row_count_limit
}
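
Code that reads these properties back migrates the same way; a small sketch, where the #[allow(deprecated)] shows the old spelling still compiling during a transition:

    use parquet::file::properties::WriterProperties;

    let props = WriterProperties::builder()
        .set_data_page_size_limit(10)
        .set_dictionary_page_size_limit(20)
        .build();

    assert_eq!(props.data_page_size_limit(), 10);
    assert_eq!(props.dictionary_page_size_limit(), 20);

    // The deprecated getter forwards to the same field, so the values agree.
    #[allow(deprecated)]
    let legacy = props.data_pagesize_limit();
    assert_eq!(legacy, 10);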
@@ -290,8 +309,8 @@ impl WriterProperties {
/// Writer properties builder.
pub struct WriterPropertiesBuilder {
- data_pagesize_limit: usize,
- dictionary_pagesize_limit: usize,
+ data_page_size_limit: usize,
+ dictionary_page_size_limit: usize,
data_page_row_count_limit: usize,
write_batch_size: usize,
max_row_group_size: usize,
@@ -307,8 +326,8 @@ impl WriterPropertiesBuilder {
/// Returns default state of the builder.
fn with_defaults() -> Self {
Self {
- data_pagesize_limit: DEFAULT_PAGE_SIZE,
- dictionary_pagesize_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
+ data_page_size_limit: DEFAULT_PAGE_SIZE,
+ dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
data_page_row_count_limit: usize::MAX,
write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
@@ -324,8 +343,8 @@ impl WriterPropertiesBuilder {
/// Finalizes the configuration and returns immutable writer properties struct.
pub fn build(self) -> WriterProperties {
WriterProperties {
- data_pagesize_limit: self.data_pagesize_limit,
- dictionary_pagesize_limit: self.dictionary_pagesize_limit,
+ data_page_size_limit: self.data_page_size_limit,
+ dictionary_page_size_limit: self.dictionary_page_size_limit,
data_page_row_count_limit: self.data_page_row_count_limit,
write_batch_size: self.write_batch_size,
max_row_group_size: self.max_row_group_size,
@@ -351,16 +370,32 @@ impl WriterPropertiesBuilder {
///
/// Note: this is a best effort limit based on value of
/// [`set_write_batch_size`](Self::set_write_batch_size).
+ #[deprecated(since = "41.0.0", note = "Use set_data_page_size_limit")]
pub fn set_data_pagesize_limit(mut self, value: usize) -> Self {
- self.data_pagesize_limit = value;
+ self.data_page_size_limit = value;
self
}
- /// Sets best effort maximum number of rows in a data page.
+ /// Sets best effort maximum size of a data page in bytes.
///
+ /// The parquet writer will attempt to limit the sizes of each
+ /// `DataPage` to this many bytes. Reducing this value will result
+ /// in larger parquet files, but may improve the effectiveness of
+ /// page index based predicate pushdown during reading.
///
- /// This can be used to limit the number of rows within a page to
- /// yield better page pruning.
+ /// Note: this is a best effort limit based on value of
+ /// [`set_write_batch_size`](Self::set_write_batch_size).
+ pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
+ self.data_page_size_limit = value;
+ self
+ }
+
+ /// Sets best effort maximum number of rows in a data page.
+ ///
+ /// The parquet writer will attempt to limit the number of rows in
+ /// each `DataPage` to this value. Reducing this value will result
+ /// in larger parquet files, but may improve the effectiveness of
+ /// page index based predicate pushdown during reading.
///
/// Note: this is a best effort limit based on value of
/// [`set_write_batch_size`](Self::set_write_batch_size).
@@ -373,8 +408,24 @@ impl WriterPropertiesBuilder {
///
/// Note: this is a best effort limit based on value of
/// [`set_write_batch_size`](Self::set_write_batch_size).
+ #[deprecated(since = "41.0.0", note = "Use set_dictionary_page_size_limit")]
pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self {
- self.dictionary_pagesize_limit = value;
+ self.dictionary_page_size_limit = value;
+ self
+ }
+
+ /// Sets best effort maximum dictionary page size, in bytes.
+ ///
+ /// The parquet writer will attempt to limit the size of each
+ /// `DataPage` used to store dictionaries to this many
+ /// bytes. Reducing this value will result in larger parquet
+ /// files, but may improve the effectiveness of page index based
+ /// predicate pushdown during reading.
+ ///
+ /// Note: this is a best effort limit based on value of
+ /// [`set_write_batch_size`](Self::set_write_batch_size).
+ pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
+ self.dictionary_page_size_limit = value;
self
}
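
Since a data page is closed when either the size limit or the row count limit trips (see should_add_data_page above), the two limits can be combined; a sketch of that interplay, with illustrative values:

    use parquet::file::properties::WriterProperties;

    // Whichever limit is reached first closes the current data page. Both
    // are only checked once per write batch, so either may be overshot by
    // up to one batch of values.
    let props = WriterProperties::builder()
        .set_data_page_size_limit(1024 * 1024)  // ~1 MiB of encoded values...
        .set_data_page_row_count_limit(20_000)  // ...or 20k rows, whichever first
        .set_write_batch_size(1024)
        .build();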
@@ -850,9 +901,9 @@ mod tests {
#[test]
fn test_writer_properties_default_settings() {
let props = WriterProperties::default();
- assert_eq!(props.data_pagesize_limit(), DEFAULT_PAGE_SIZE);
+ assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
assert_eq!(
- props.dictionary_pagesize_limit(),
+ props.dictionary_page_size_limit(),
DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
);
assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
@@ -939,8 +990,8 @@ mod tests {
let props = WriterProperties::builder()
// file settings
.set_writer_version(WriterVersion::PARQUET_2_0)
- .set_data_pagesize_limit(10)
- .set_dictionary_pagesize_limit(20)
+ .set_data_page_size_limit(10)
+ .set_dictionary_page_size_limit(20)
.set_write_batch_size(30)
.set_max_row_group_size(40)
.set_created_by("default".to_owned())
@@ -969,8 +1020,8 @@ mod tests {
.build();
assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
- assert_eq!(props.data_pagesize_limit(), 10);
- assert_eq!(props.dictionary_pagesize_limit(), 20);
+ assert_eq!(props.data_page_size_limit(), 10);
+ assert_eq!(props.dictionary_page_size_limit(), 20);
assert_eq!(props.write_batch_size(), 30);
assert_eq!(props.max_row_group_size(), 40);
assert_eq!(props.created_by(), "default");
diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs
index 4bf649f24..142112b7b 100644
--- a/parquet/tests/arrow_writer_layout.rs
+++ b/parquet/tests/arrow_writer_layout.rs
@@ -175,7 +175,7 @@ fn test_primitive() {
let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
let props = WriterProperties::builder()
.set_dictionary_enabled(false)
- .set_data_pagesize_limit(1000)
+ .set_data_page_size_limit(1000)
.set_write_batch_size(10)
.build();
@@ -204,8 +204,8 @@ fn test_primitive() {
// Test spill dictionary
let props = WriterProperties::builder()
.set_dictionary_enabled(true)
- .set_dictionary_pagesize_limit(1000)
- .set_data_pagesize_limit(10000)
+ .set_dictionary_page_size_limit(1000)
+ .set_data_page_size_limit(10000)
.set_write_batch_size(10)
.build();
@@ -246,8 +246,8 @@ fn test_primitive() {
// Test spill dictionary encoded pages
let props = WriterProperties::builder()
.set_dictionary_enabled(true)
- .set_dictionary_pagesize_limit(10000)
- .set_data_pagesize_limit(500)
+ .set_dictionary_page_size_limit(10000)
+ .set_data_page_size_limit(500)
.set_write_batch_size(10)
.build();
@@ -350,7 +350,7 @@ fn test_string() {
let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
let props = WriterProperties::builder()
.set_dictionary_enabled(false)
- .set_data_pagesize_limit(1000)
+ .set_data_page_size_limit(1000)
.set_write_batch_size(10)
.build();
@@ -386,8 +386,8 @@ fn test_string() {
// Test spill dictionary
let props = WriterProperties::builder()
.set_dictionary_enabled(true)
- .set_dictionary_pagesize_limit(1000)
- .set_data_pagesize_limit(10000)
+ .set_dictionary_page_size_limit(1000)
+ .set_data_page_size_limit(10000)
.set_write_batch_size(10)
.build();
@@ -435,8 +435,8 @@ fn test_string() {
// Test spill dictionary encoded pages
let props = WriterProperties::builder()
.set_dictionary_enabled(true)
- .set_dictionary_pagesize_limit(20000)
- .set_data_pagesize_limit(500)
+ .set_dictionary_page_size_limit(20000)
+ .set_data_page_size_limit(500)
.set_write_batch_size(10)
.build();