You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/24 13:40:33 UTC
[arrow-rs] branch master updated: Bloom filter config tweaks (#3023) (#3175)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new eefbdce22 Bloom filter config tweaks (#3023) (#3175)
eefbdce22 is described below
commit eefbdce229eb355bbdfd5ccbc24247be882fda15
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Nov 24 13:40:28 2022 +0000
Bloom filter config tweaks (#3023) (#3175)
* Bloom filter config tweaks (#3023)
* Further tweaks
---
parquet/src/column/writer/encoder.rs | 16 ++-----
parquet/src/file/properties.rs | 86 ++++++++++++++++++------------------
2 files changed, 46 insertions(+), 56 deletions(-)
diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs
index 0d0716d7a..c343f1d6c 100644
--- a/parquet/src/column/writer/encoder.rs
+++ b/parquet/src/column/writer/encoder.rs
@@ -25,7 +25,6 @@ use crate::data_type::private::ParquetValueType;
use crate::data_type::DataType;
use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder};
use crate::errors::{ParquetError, Result};
-use crate::file::properties::BloomFilterProperties;
use crate::file::properties::{EnabledStatistics, WriterProperties};
use crate::schema::types::{ColumnDescPtr, ColumnDescriptor};
use crate::util::memory::ByteBufferPtr;
@@ -194,17 +193,10 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {
let statistics_enabled = props.statistics_enabled(descr.path());
- let bloom_filter_enabled = props.bloom_filter_enabled(descr.path());
- let bloom_filter =
- if let Some(BloomFilterProperties { ndv, fpp }) = bloom_filter_enabled {
- Sbbf::new_with_ndv_fpp(ndv, fpp)
- .map_err(|e| {
- eprintln!("invalid bloom filter properties: {}", e);
- })
- .ok()
- } else {
- None
- };
+ let bloom_filter = props
+ .bloom_filter_properties(descr.path())
+ .map(|props| Sbbf::new_with_ndv_fpp(props.ndv, props.fpp))
+ .transpose()?;
Ok(Self {
encoder,
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 6d30be2e4..c8083fcf3 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -260,16 +260,17 @@ impl WriterProperties {
.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
}
- /// Returns whether bloom filter is enabled for a given column. Bloom filter can be enabled over
- /// all or for a specific column, and is by default set to be disabled.
- pub fn bloom_filter_enabled(
+ /// Returns the [`BloomFilterProperties`] for the given column
+ ///
+ /// Returns `None` if bloom filter is disabled
+ pub fn bloom_filter_properties(
&self,
col: &ColumnPath,
- ) -> Option<BloomFilterProperties> {
+ ) -> Option<&BloomFilterProperties> {
self.column_properties
.get(col)
- .and_then(|c| c.bloom_filter_enabled())
- .or_else(|| self.default_column_properties.bloom_filter_enabled())
+ .and_then(|c| c.bloom_filter_properties())
+ .or_else(|| self.default_column_properties.bloom_filter_properties())
}
}
@@ -628,7 +629,7 @@ struct ColumnProperties {
statistics_enabled: Option<EnabledStatistics>,
max_statistics_size: Option<usize>,
/// bloom filter related properties
- bloom_filter_enabled: Option<BloomFilterProperties>,
+ bloom_filter_properies: Option<BloomFilterProperties>,
}
impl ColumnProperties {
@@ -672,40 +673,37 @@ impl ColumnProperties {
/// otherwise it is a no-op.
/// If `value` is `false`, resets bloom filter properties to `None`.
fn set_bloom_filter_enabled(&mut self, value: bool) {
- if value {
- self.bloom_filter_enabled = self
- .bloom_filter_enabled()
- .or_else(|| Some(Default::default()));
- } else {
- self.bloom_filter_enabled = None;
+ if value && self.bloom_filter_properies.is_none() {
+ self.bloom_filter_properies = Some(Default::default())
+ } else if !value {
+ self.bloom_filter_properies = None
}
}
/// Sets the false positive probability for bloom filter for this column, and implicitly enables
- /// bloom filter if not previously enabled. If the `value` is not between 0 and 1 exclusive, it is
- /// discarded as no-op.
+ /// bloom filter if not previously enabled.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the `value` is not between 0 and 1 exclusive
fn set_bloom_filter_fpp(&mut self, value: f64) {
- if (0.0..1.0).contains(&value) {
- self.bloom_filter_enabled = self
- .bloom_filter_enabled()
- .or_else(|| Some(Default::default()))
- .map(|BloomFilterProperties { ndv, .. }| BloomFilterProperties {
- ndv,
- fpp: value,
- });
- }
+ assert!(
+ value > 0. && value < 1.0,
+ "fpp must be between 0 and 1 exclusive, got {}",
+ value
+ );
+
+ self.bloom_filter_properies
+ .get_or_insert_with(Default::default)
+ .fpp = value;
}
/// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
/// enables bloom filter if not previously enabled.
fn set_bloom_filter_ndv(&mut self, value: u64) {
- self.bloom_filter_enabled = self
- .bloom_filter_enabled()
- .or_else(|| Some(Default::default()))
- .map(|BloomFilterProperties { fpp, .. }| BloomFilterProperties {
- ndv: value,
- fpp,
- });
+ self.bloom_filter_properies
+ .get_or_insert_with(Default::default)
+ .ndv = value;
}
/// Returns optional encoding for this column.
@@ -736,9 +734,9 @@ impl ColumnProperties {
self.max_statistics_size
}
- /// Returns bloom filter properties if set.
- fn bloom_filter_enabled(&self) -> Option<BloomFilterProperties> {
- self.bloom_filter_enabled.clone()
+ /// Returns the bloom filter properties, or `None` if not enabled
+ fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
+ self.bloom_filter_properies.as_ref()
}
}
@@ -867,7 +865,7 @@ mod tests {
DEFAULT_MAX_STATISTICS_SIZE
);
assert!(props
- .bloom_filter_enabled(&ColumnPath::from("col"))
+ .bloom_filter_properties(&ColumnPath::from("col"))
.is_none());
}
@@ -997,8 +995,8 @@ mod tests {
);
assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123);
assert_eq!(
- props.bloom_filter_enabled(&ColumnPath::from("col")),
- Some(BloomFilterProperties { fpp: 0.1, ndv: 100 })
+ props.bloom_filter_properties(&ColumnPath::from("col")),
+ Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
);
}
@@ -1024,8 +1022,8 @@ mod tests {
DEFAULT_DICTIONARY_ENABLED
);
assert_eq!(
- props.bloom_filter_enabled(&ColumnPath::from("col")),
- Some(BloomFilterProperties {
+ props.bloom_filter_properties(&ColumnPath::from("col")),
+ Some(&BloomFilterProperties {
fpp: 0.05,
ndv: 1_000_000_u64
})
@@ -1037,15 +1035,15 @@ mod tests {
assert_eq!(
WriterProperties::builder()
.build()
- .bloom_filter_enabled(&ColumnPath::from("col")),
+ .bloom_filter_properties(&ColumnPath::from("col")),
None
);
assert_eq!(
WriterProperties::builder()
.set_bloom_filter_ndv(100)
.build()
- .bloom_filter_enabled(&ColumnPath::from("col")),
- Some(BloomFilterProperties {
+ .bloom_filter_properties(&ColumnPath::from("col")),
+ Some(&BloomFilterProperties {
fpp: 0.05,
ndv: 100
})
@@ -1054,8 +1052,8 @@ mod tests {
WriterProperties::builder()
.set_bloom_filter_fpp(0.1)
.build()
- .bloom_filter_enabled(&ColumnPath::from("col")),
- Some(BloomFilterProperties {
+ .bloom_filter_properties(&ColumnPath::from("col")),
+ Some(&BloomFilterProperties {
fpp: 0.1,
ndv: 1_000_000_u64
})