You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/24 13:40:33 UTC

[arrow-rs] branch master updated: Bloom filter config tweaks (#3023) (#3175)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new eefbdce22 Bloom filter config tweaks (#3023) (#3175)
eefbdce22 is described below

commit eefbdce229eb355bbdfd5ccbc24247be882fda15
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Nov 24 13:40:28 2022 +0000

    Bloom filter config tweaks (#3023) (#3175)
    
    * Bloom filter config tweaks (#3023)
    
    * Further tweaks
---
 parquet/src/column/writer/encoder.rs | 16 ++-----
 parquet/src/file/properties.rs       | 86 ++++++++++++++++++------------------
 2 files changed, 46 insertions(+), 56 deletions(-)

diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs
index 0d0716d7a..c343f1d6c 100644
--- a/parquet/src/column/writer/encoder.rs
+++ b/parquet/src/column/writer/encoder.rs
@@ -25,7 +25,6 @@ use crate::data_type::private::ParquetValueType;
 use crate::data_type::DataType;
 use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder};
 use crate::errors::{ParquetError, Result};
-use crate::file::properties::BloomFilterProperties;
 use crate::file::properties::{EnabledStatistics, WriterProperties};
 use crate::schema::types::{ColumnDescPtr, ColumnDescriptor};
 use crate::util::memory::ByteBufferPtr;
@@ -194,17 +193,10 @@ impl<T: DataType> ColumnValueEncoder for ColumnValueEncoderImpl<T> {
 
         let statistics_enabled = props.statistics_enabled(descr.path());
 
-        let bloom_filter_enabled = props.bloom_filter_enabled(descr.path());
-        let bloom_filter =
-            if let Some(BloomFilterProperties { ndv, fpp }) = bloom_filter_enabled {
-                Sbbf::new_with_ndv_fpp(ndv, fpp)
-                    .map_err(|e| {
-                        eprintln!("invalid bloom filter properties: {}", e);
-                    })
-                    .ok()
-            } else {
-                None
-            };
+        let bloom_filter = props
+            .bloom_filter_properties(descr.path())
+            .map(|props| Sbbf::new_with_ndv_fpp(props.ndv, props.fpp))
+            .transpose()?;
 
         Ok(Self {
             encoder,
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 6d30be2e4..c8083fcf3 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -260,16 +260,17 @@ impl WriterProperties {
             .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
     }
 
-    /// Returns whether bloom filter is enabled for a given column. Bloom filter can be enabled over
-    /// all or for a specific column, and is by default set to be disabled.
-    pub fn bloom_filter_enabled(
+    /// Returns the [`BloomFilterProperties`] for the given column
+    ///
+    /// Returns `None` if bloom filter is disabled
+    pub fn bloom_filter_properties(
         &self,
         col: &ColumnPath,
-    ) -> Option<BloomFilterProperties> {
+    ) -> Option<&BloomFilterProperties> {
         self.column_properties
             .get(col)
-            .and_then(|c| c.bloom_filter_enabled())
-            .or_else(|| self.default_column_properties.bloom_filter_enabled())
+            .and_then(|c| c.bloom_filter_properties())
+            .or_else(|| self.default_column_properties.bloom_filter_properties())
     }
 }
 
@@ -628,7 +629,7 @@ struct ColumnProperties {
     statistics_enabled: Option<EnabledStatistics>,
     max_statistics_size: Option<usize>,
     /// bloom filter related properties
-    bloom_filter_enabled: Option<BloomFilterProperties>,
+    bloom_filter_properies: Option<BloomFilterProperties>,
 }
 
 impl ColumnProperties {
@@ -672,40 +673,37 @@ impl ColumnProperties {
     /// otherwise it is a no-op.
     /// If `value` is `false`, resets bloom filter properties to `None`.
     fn set_bloom_filter_enabled(&mut self, value: bool) {
-        if value {
-            self.bloom_filter_enabled = self
-                .bloom_filter_enabled()
-                .or_else(|| Some(Default::default()));
-        } else {
-            self.bloom_filter_enabled = None;
+        if value && self.bloom_filter_properies.is_none() {
+            self.bloom_filter_properies = Some(Default::default())
+        } else if !value {
+            self.bloom_filter_properies = None
         }
     }
 
     /// Sets the false positive probability for bloom filter for this column, and implicitly enables
-    /// bloom filter if not previously enabled. If the `value` is not between 0 and 1 exclusive, it is
-    /// discarded as no-op.
+    /// bloom filter if not previously enabled.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the `value` is not between 0 and 1 exclusive
     fn set_bloom_filter_fpp(&mut self, value: f64) {
-        if (0.0..1.0).contains(&value) {
-            self.bloom_filter_enabled = self
-                .bloom_filter_enabled()
-                .or_else(|| Some(Default::default()))
-                .map(|BloomFilterProperties { ndv, .. }| BloomFilterProperties {
-                    ndv,
-                    fpp: value,
-                });
-        }
+        assert!(
+            value > 0. && value < 1.0,
+            "fpp must be between 0 and 1 exclusive, got {}",
+            value
+        );
+
+        self.bloom_filter_properies
+            .get_or_insert_with(Default::default)
+            .fpp = value;
     }
 
     /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
     /// enables bloom filter if not previously enabled.
     fn set_bloom_filter_ndv(&mut self, value: u64) {
-        self.bloom_filter_enabled = self
-            .bloom_filter_enabled()
-            .or_else(|| Some(Default::default()))
-            .map(|BloomFilterProperties { fpp, .. }| BloomFilterProperties {
-                ndv: value,
-                fpp,
-            });
+        self.bloom_filter_properies
+            .get_or_insert_with(Default::default)
+            .ndv = value;
     }
 
     /// Returns optional encoding for this column.
@@ -736,9 +734,9 @@ impl ColumnProperties {
         self.max_statistics_size
     }
 
-    /// Returns bloom filter properties if set.
-    fn bloom_filter_enabled(&self) -> Option<BloomFilterProperties> {
-        self.bloom_filter_enabled.clone()
+    /// Returns the bloom filter properties, or `None` if not enabled
+    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
+        self.bloom_filter_properies.as_ref()
     }
 }
 
@@ -867,7 +865,7 @@ mod tests {
             DEFAULT_MAX_STATISTICS_SIZE
         );
         assert!(props
-            .bloom_filter_enabled(&ColumnPath::from("col"))
+            .bloom_filter_properties(&ColumnPath::from("col"))
             .is_none());
     }
 
@@ -997,8 +995,8 @@ mod tests {
         );
         assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123);
         assert_eq!(
-            props.bloom_filter_enabled(&ColumnPath::from("col")),
-            Some(BloomFilterProperties { fpp: 0.1, ndv: 100 })
+            props.bloom_filter_properties(&ColumnPath::from("col")),
+            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
         );
     }
 
@@ -1024,8 +1022,8 @@ mod tests {
             DEFAULT_DICTIONARY_ENABLED
         );
         assert_eq!(
-            props.bloom_filter_enabled(&ColumnPath::from("col")),
-            Some(BloomFilterProperties {
+            props.bloom_filter_properties(&ColumnPath::from("col")),
+            Some(&BloomFilterProperties {
                 fpp: 0.05,
                 ndv: 1_000_000_u64
             })
@@ -1037,15 +1035,15 @@ mod tests {
         assert_eq!(
             WriterProperties::builder()
                 .build()
-                .bloom_filter_enabled(&ColumnPath::from("col")),
+                .bloom_filter_properties(&ColumnPath::from("col")),
             None
         );
         assert_eq!(
             WriterProperties::builder()
                 .set_bloom_filter_ndv(100)
                 .build()
-                .bloom_filter_enabled(&ColumnPath::from("col")),
-            Some(BloomFilterProperties {
+                .bloom_filter_properties(&ColumnPath::from("col")),
+            Some(&BloomFilterProperties {
                 fpp: 0.05,
                 ndv: 100
             })
@@ -1054,8 +1052,8 @@ mod tests {
             WriterProperties::builder()
                 .set_bloom_filter_fpp(0.1)
                 .build()
-                .bloom_filter_enabled(&ColumnPath::from("col")),
-            Some(BloomFilterProperties {
+                .bloom_filter_properties(&ColumnPath::from("col")),
+            Some(&BloomFilterProperties {
                 fpp: 0.1,
                 ndv: 1_000_000_u64
             })