You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ji...@apache.org on 2022/11/15 15:01:42 UTC
[arrow-rs] 01/04: add column setter
This is an automated email from the ASF dual-hosted git repository.
jiayuliu pushed a commit to branch add-bloom-filter-3
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 777b0dc6f7d4a08af896772893071681c9d17b21
Author: Jiayu Liu <ji...@hey.com>
AuthorDate: Tue Nov 15 20:53:32 2022 +0800
add column setter
---
parquet/Cargo.toml | 1 +
parquet/src/file/properties.rs | 102 +++++++++++++++++++++++++++++++++++------
2 files changed, 89 insertions(+), 14 deletions(-)
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index fc7c8218a..72baaf338 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -58,6 +58,7 @@ futures = { version = "0.3", default-features = false, features = ["std"], optio
tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] }
hashbrown = { version = "0.13", default-features = false }
twox-hash = { version = "1.6", optional = true }
+paste = "1.0"
[dev-dependencies]
base64 = { version = "0.13", default-features = false, features = ["std"] }
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index cf821df21..c0e789ca1 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -248,6 +248,15 @@ impl WriterProperties {
.or_else(|| self.default_column_properties.max_statistics_size())
.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
}
+
+ /// Returns whether bloom filter is enabled for `col`: per-column setting, else the default, else `false`.
+ pub fn bloom_filter_enabled(&self, col: &ColumnPath) -> bool {
+ self.column_properties
+ .get(col)
+ .and_then(|c| c.bloom_filter_enabled())
+ .or_else(|| self.default_column_properties.bloom_filter_enabled())
+ .unwrap_or(false)
+ }
}
/// Writer properties builder.
@@ -264,6 +273,16 @@ pub struct WriterPropertiesBuilder {
column_properties: HashMap<ColumnPath, ColumnProperties>,
}
+/// Generates a consuming builder setter named `set_<field>` that stores `value`
+/// into `self.<field>` and returns the builder.
+///
+/// * `$field` - name of the builder field to assign; also the suffix of the
+///   generated method name.
+/// * `$field_type` - type of the field, matched as a `ty` fragment so it can be
+///   used in the parameter's type position.
+///
+/// The method name is assembled with `paste`, because `concat_idents!` cannot be
+/// used where an item name is expected.
+///
+/// NOTE(review): invoking this for a field that already has a hand-written
+/// `set_<field>` method in the same `impl` produces a duplicate definition.
+macro_rules! def_per_col_setter {
+    ($field:ident, $field_type:ty) => {
+        paste::paste! {
+            #[doc = "Sets `" $field "` and returns the builder."]
+            pub fn [<set_ $field>](mut self, value: $field_type) -> Self {
+                self.$field = value;
+                self
+            }
+        }
+    };
+}
+
impl WriterPropertiesBuilder {
/// Returns default state of the builder.
fn with_defaults() -> Self {
@@ -276,7 +295,7 @@ impl WriterPropertiesBuilder {
writer_version: DEFAULT_WRITER_VERSION,
created_by: DEFAULT_CREATED_BY.to_string(),
key_value_metadata: None,
- default_column_properties: ColumnProperties::new(),
+ default_column_properties: Default::default(),
column_properties: HashMap::new(),
}
}
@@ -306,6 +325,8 @@ impl WriterPropertiesBuilder {
self
}
+ def_per_col_setter!(writer_version, WriterVersion);
+
/// Sets best effort maximum size of a data page in bytes.
///
/// Note: this is a best effort limit based on value of
@@ -423,7 +444,7 @@ impl WriterPropertiesBuilder {
fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
self.column_properties
.entry(col)
- .or_insert_with(ColumnProperties::new)
+ .or_insert_with(Default::default)
}
/// Sets encoding for a column.
@@ -476,6 +497,17 @@ impl WriterPropertiesBuilder {
self.get_mut_props(col).set_max_statistics_size(value);
self
}
+
+ /// Sets whether bloom filter is enabled for the column `col` and returns the builder.
+ /// Takes precedence over globally defined settings.
+ pub fn set_column_bloom_filter_enabled(
+ mut self,
+ col: ColumnPath,
+ value: bool,
+ ) -> Self {
+ self.get_mut_props(col).set_bloom_filter_enabled(value);
+ self
+ }
}
/// Controls the level of statistics to be computed by the writer
@@ -499,27 +531,24 @@ impl Default for EnabledStatistics {
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
encoding: Option<Encoding>,
codec: Option<Compression>,
dictionary_enabled: Option<bool>,
statistics_enabled: Option<EnabledStatistics>,
max_statistics_size: Option<usize>,
+ /// Whether bloom filter is enabled; `None` if not set for this column
+ bloom_filter_enabled: Option<bool>,
+ /// Bloom filter expected number of distinct values; `None` if not set
+ bloom_filter_ndv: Option<u64>,
+ /// Bloom filter false positive probability; `None` if not set
+ bloom_filter_fpp: Option<f64>,
+ /// Bloom filter maximum number of bytes; `None` if not set
+ bloom_filter_max_bytes: Option<u32>,
}
impl ColumnProperties {
- /// Initialise column properties with default values.
- fn new() -> Self {
- Self {
- encoding: None,
- codec: None,
- dictionary_enabled: None,
- statistics_enabled: None,
- max_statistics_size: None,
- }
- }
-
/// Sets encoding for this column.
///
/// If dictionary is not enabled, this is treated as a primary encoding for a column.
@@ -556,6 +585,26 @@ impl ColumnProperties {
self.max_statistics_size = Some(value);
}
+ /// Sets whether bloom filter is enabled for this column.
+ fn set_bloom_filter_enabled(&mut self, enabled: bool) {
+ self.bloom_filter_enabled = Some(enabled);
+ }
+
+ /// Sets the bloom filter maximum size in bytes (stored in `bloom_filter_max_bytes`).
+ fn set_bloom_filter_max_size(&mut self, value: u32) {
+ self.bloom_filter_max_bytes = Some(value);
+ }
+
+ /// Sets the bloom filter expected number of distinct values (NDV) for this column.
+ fn set_bloom_filter_ndv(&mut self, value: u64) {
+ self.bloom_filter_ndv = Some(value);
+ }
+
+ /// Sets the bloom filter false positive probability for this column; the value is stored unvalidated.
+ fn set_bloom_filter_fpp(&mut self, value: f64) {
+ self.bloom_filter_fpp = Some(value);
+ }
+
/// Returns optional encoding for this column.
fn encoding(&self) -> Option<Encoding> {
self.encoding
@@ -583,6 +632,30 @@ impl ColumnProperties {
fn max_statistics_size(&self) -> Option<usize> {
self.max_statistics_size
}
+
+ /// Returns `Some(true)` if bloom filter is enabled for this column, `Some(false)` if it is
+ /// disabled, and `None` if no setting has been provided.
+ fn bloom_filter_enabled(&self) -> Option<bool> {
+ self.bloom_filter_enabled
+ }
+
+ /// Returns the bloom filter maximum size in bytes configured for this column,
+ /// or `None` if it has not been set.
+ fn bloom_filter_max_bytes(&self) -> Option<u32> {
+ self.bloom_filter_max_bytes
+ }
+
+ /// Returns the bloom filter expected number of distinct values configured for this column,
+ /// or `None` if it has not been set.
+ fn bloom_filter_ndv(&self) -> Option<u64> {
+ self.bloom_filter_ndv
+ }
+
+ /// Returns the bloom filter false positive probability configured for this column,
+ /// or `None` if it has not been set.
+ fn bloom_filter_fpp(&self) -> Option<f64> {
+ self.bloom_filter_fpp
+ }
}
/// Reference counted reader properties.
@@ -685,6 +758,7 @@ mod tests {
props.max_statistics_size(&ColumnPath::from("col")),
DEFAULT_MAX_STATISTICS_SIZE
);
+ assert_eq!(props.bloom_filter_enabled(&ColumnPath::from("col")), false);
}
#[test]