You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/06/09 17:18:06 UTC
[arrow-rs] branch master updated: Improve parquet WriterProperties and ReaderProperties docs (#4392)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7bedb0a3a Improve parquet WriterProperties and ReaderProperties docs (#4392)
7bedb0a3a is described below
commit 7bedb0a3ac642395daaa5318cb71f8f5c994657b
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Fri Jun 9 13:18:00 2023 -0400
Improve parquet WriterProperties and ReaderProperties docs (#4392)
---
parquet/src/file/properties.rs | 139 ++++++++++++++++++++++-------------------
1 file changed, 74 insertions(+), 65 deletions(-)
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 66690463a..9724fd7f4 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -15,55 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! [`WriterProperties`]
-//!
-//! # Usage
-//!
-//! ```rust
-//! use parquet::{
-//! basic::{Compression, Encoding},
-//! file::properties::*,
-//! schema::types::ColumnPath,
-//! };
-//!
-//! // Create properties with default configuration.
-//! let props = WriterProperties::default();
-//!
-//! // Use properties builder to set certain options and assemble the configuration.
-//! let props = WriterProperties::builder()
-//! .set_writer_version(WriterVersion::PARQUET_1_0)
-//! .set_encoding(Encoding::PLAIN)
-//! .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
-//! .set_compression(Compression::SNAPPY)
-//! .build();
-//!
-//! assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
-//! assert_eq!(
-//! props.encoding(&ColumnPath::from("col1")),
-//! Some(Encoding::DELTA_BINARY_PACKED)
-//! );
-//! assert_eq!(
-//! props.encoding(&ColumnPath::from("col2")),
-//! Some(Encoding::PLAIN)
-//! );
-//! ```
-//!
-//! Reader properties.
-//!
-//! # Usage
-//!
-//! ```rust
-//! use parquet::file::properties::ReaderProperties;
-//!
-//! // Create properties with default configuration.
-//! let props = ReaderProperties::builder().build();
-//!
-//! // Use properties builder to set certain options and assemble the configuration.
-//! let props = ReaderProperties::builder()
-//! .set_backward_compatible_lz4(false)
-//! .build();
-//! ```
-
+//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
use std::{collections::HashMap, sync::Arc};
use crate::basic::{Compression, Encoding};
@@ -72,20 +24,30 @@ use crate::file::metadata::KeyValue;
use crate::format::SortingColumn;
use crate::schema::types::ColumnPath;
-const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
-const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
-const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
-const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
-const DEFAULT_DICTIONARY_ENABLED: bool = true;
-const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
-const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
-const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
-const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
-const DEFAULT_CREATED_BY: &str =
+/// Default value for [`WriterProperties::data_page_size_limit`]
+pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
+/// Default value for [`WriterProperties::write_batch_size`]
+pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
+/// Default value for [`WriterProperties::writer_version`]
+pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
+/// Default value for [`WriterProperties::compression`]
+pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
+/// Default value for [`WriterProperties::dictionary_enabled`]
+pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
+/// Default value for [`WriterProperties::dictionary_page_size_limit`]
+pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
+/// Default value for [`WriterProperties::statistics_enabled`]
+pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
+/// Default value for [`WriterProperties::max_statistics_size`]
+pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
+/// Default value for [`WriterProperties::max_row_group_size`]
+pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
+/// Default value for [`WriterProperties::created_by`]
+pub const DEFAULT_CREATED_BY: &str =
concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
-/// default value for the false positive probability used in a bloom filter.
+/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
-/// default value for the expected number of distinct values used in a bloom filter.
+/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Parquet writer version.
@@ -111,10 +73,41 @@ impl WriterVersion {
/// Reference counted writer properties.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
-/// Writer properties.
+/// Configuration settings for writing parquet files.
///
/// All properties except the key-value metadata are immutable,
/// use [`WriterPropertiesBuilder`] to assemble these properties.
+///
+/// # Example
+///
+/// ```rust
+/// use parquet::{
+/// basic::{Compression, Encoding},
+/// file::properties::*,
+/// schema::types::ColumnPath,
+/// };
+///
+/// // Create properties with default configuration.
+/// let props = WriterProperties::default();
+///
+/// // Use properties builder to set certain options and assemble the configuration.
+/// let props = WriterProperties::builder()
+/// .set_writer_version(WriterVersion::PARQUET_1_0)
+/// .set_encoding(Encoding::PLAIN)
+/// .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
+/// .set_compression(Compression::SNAPPY)
+/// .build();
+///
+/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
+/// assert_eq!(
+/// props.encoding(&ColumnPath::from("col1")),
+/// Some(Encoding::DELTA_BINARY_PACKED)
+/// );
+/// assert_eq!(
+/// props.encoding(&ColumnPath::from("col2")),
+/// Some(Encoding::PLAIN)
+/// );
+/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
data_page_size_limit: usize,
@@ -307,7 +300,8 @@ impl WriterProperties {
}
}
-/// Writer properties builder.
+/// Builder for parquet file writer configuration. See example on
+/// [`WriterProperties`]
pub struct WriterPropertiesBuilder {
data_page_size_limit: usize,
dictionary_page_size_limit: usize,
@@ -809,10 +803,24 @@ pub type ReaderPropertiesPtr = Arc<ReaderProperties>;
const DEFAULT_READ_BLOOM_FILTER: bool = false;
-/// Reader properties.
+/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
+///
+/// # Example
+///
+/// ```rust
+/// use parquet::file::properties::ReaderProperties;
+///
+/// // Create properties with default configuration.
+/// let props = ReaderProperties::builder().build();
+///
+/// // Use properties builder to set certain options and assemble the configuration.
+/// let props = ReaderProperties::builder()
+/// .set_backward_compatible_lz4(false)
+/// .build();
+/// ```
pub struct ReaderProperties {
codec_options: CodecOptions,
read_bloom_filter: bool,
@@ -835,7 +843,8 @@ impl ReaderProperties {
}
}
-/// Reader properties builder.
+/// Builder for parquet file reader configuration. See example on
+/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
codec_options_builder: CodecOptionsBuilder,
read_bloom_filter: Option<bool>,