You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ji...@apache.org on 2022/11/15 15:01:42 UTC

[arrow-rs] 01/04: add column setter

This is an automated email from the ASF dual-hosted git repository.

jiayuliu pushed a commit to branch add-bloom-filter-3
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git

commit 777b0dc6f7d4a08af896772893071681c9d17b21
Author: Jiayu Liu <ji...@hey.com>
AuthorDate: Tue Nov 15 20:53:32 2022 +0800

    add column setter
---
 parquet/Cargo.toml             |   1 +
 parquet/src/file/properties.rs | 102 +++++++++++++++++++++++++++++++++++------
 2 files changed, 89 insertions(+), 14 deletions(-)

diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index fc7c8218a..72baaf338 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -58,6 +58,7 @@ futures = { version = "0.3", default-features = false, features = ["std"], optio
 tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "rt", "io-util"] }
 hashbrown = { version = "0.13", default-features = false }
 twox-hash = { version = "1.6", optional = true }
+paste = "1.0"
 
 [dev-dependencies]
 base64 = { version = "0.13", default-features = false, features = ["std"] }
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index cf821df21..c0e789ca1 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -248,6 +248,15 @@ impl WriterProperties {
             .or_else(|| self.default_column_properties.max_statistics_size())
             .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
     }
+
+    /// Bloom filter flag for `col`: per-column value, else default, else `false`.
+    pub fn bloom_filter_enabled(&self, col: &ColumnPath) -> bool {
+        let per_column = self.column_properties.get(col);
+        match per_column.and_then(ColumnProperties::bloom_filter_enabled) {
+            Some(enabled) => enabled,
+            None => self.default_column_properties.bloom_filter_enabled() == Some(true),
+        }
+    }
 }
 
 /// Writer properties builder.
@@ -264,6 +273,24 @@ pub struct WriterPropertiesBuilder {
     column_properties: HashMap<ColumnPath, ColumnProperties>,
 }
 
+/// Generates a consuming builder setter named `set_<field>` that stores `value`
+/// in the builder field `<field>` and returns the builder.
+///
+/// `$field` is the name of a field on the type whose `impl` block invokes the
+/// macro; `$field_type` is that field's type.
+// NOTE(review): invoking this for a field that already has a hand-written
+// `set_<field>` method would define the method twice — confirm call sites.
+macro_rules! def_per_col_setter {
+    ($field:ident, $field_type:ty) => {
+        paste::paste! {
+            pub fn [<set_ $field>](mut self, value: $field_type) -> Self {
+                self.$field = value;
+                self
+            }
+        }
+    };
+}
+
 impl WriterPropertiesBuilder {
     /// Returns default state of the builder.
     fn with_defaults() -> Self {
@@ -276,7 +295,7 @@ impl WriterPropertiesBuilder {
             writer_version: DEFAULT_WRITER_VERSION,
             created_by: DEFAULT_CREATED_BY.to_string(),
             key_value_metadata: None,
-            default_column_properties: ColumnProperties::new(),
+            default_column_properties: Default::default(),
             column_properties: HashMap::new(),
         }
     }
@@ -306,6 +325,8 @@ impl WriterPropertiesBuilder {
         self
     }
 
+    def_per_col_setter!(writer_version, WriterVersion);
+
     /// Sets best effort maximum size of a data page in bytes.
     ///
     /// Note: this is a best effort limit based on value of
@@ -423,7 +444,7 @@ impl WriterPropertiesBuilder {
     fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
         self.column_properties
             .entry(col)
-            .or_insert_with(ColumnProperties::new)
+            .or_insert_with(Default::default)
     }
 
     /// Sets encoding for a column.
@@ -476,6 +497,17 @@ impl WriterPropertiesBuilder {
         self.get_mut_props(col).set_max_statistics_size(value);
         self
     }
+
+    /// Enables or disables the bloom filter for the column `col`.
+    /// The per-column value takes precedence over the globally defined setting.
+    pub fn set_column_bloom_filter_enabled(
+        mut self,
+        col: ColumnPath,
+        value: bool,
+    ) -> Self {
+        self.get_mut_props(col).set_bloom_filter_enabled(value);
+        self
+    }
 }
 
 /// Controls the level of statistics to be computed by the writer
@@ -499,27 +531,24 @@ impl Default for EnabledStatistics {
 ///
 /// If a field is `None`, it means that no specific value has been set for this column,
 /// so some subsequent or default value must be used.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, Default, PartialEq)]
 struct ColumnProperties {
     encoding: Option<Encoding>,
     codec: Option<Compression>,
     dictionary_enabled: Option<bool>,
     statistics_enabled: Option<EnabledStatistics>,
     max_statistics_size: Option<usize>,
+    /// Whether the bloom filter is enabled; `None` if not set for this column.
+    bloom_filter_enabled: Option<bool>,
+    /// Expected number of distinct values (NDV) for the bloom filter, if set.
+    bloom_filter_ndv: Option<u64>,
+    /// False positive probability (FPP) for the bloom filter, if set.
+    bloom_filter_fpp: Option<f64>,
+    /// Maximum size of the bloom filter in bytes, if set.
+    bloom_filter_max_bytes: Option<u32>,
 }
 
 impl ColumnProperties {
-    /// Initialise column properties with default values.
-    fn new() -> Self {
-        Self {
-            encoding: None,
-            codec: None,
-            dictionary_enabled: None,
-            statistics_enabled: None,
-            max_statistics_size: None,
-        }
-    }
-
     /// Sets encoding for this column.
     ///
     /// If dictionary is not enabled, this is treated as a primary encoding for a column.
@@ -556,6 +585,26 @@ impl ColumnProperties {
         self.max_statistics_size = Some(value);
     }
 
+    /// Sets whether the bloom filter is enabled for this column.
+    fn set_bloom_filter_enabled(&mut self, enabled: bool) {
+        self.bloom_filter_enabled = Some(enabled);
+    }
+
+    /// Sets the bloom filter max size in bytes (stored as `bloom_filter_max_bytes`).
+    fn set_bloom_filter_max_size(&mut self, value: u32) {
+        self.bloom_filter_max_bytes = Some(value);
+    }
+
+    /// Sets the expected number of distinct values (NDV) for the bloom filter.
+    fn set_bloom_filter_ndv(&mut self, value: u64) {
+        self.bloom_filter_ndv = Some(value);
+    }
+
+    /// Sets the false positive probability (FPP) for the bloom filter.
+    fn set_bloom_filter_fpp(&mut self, value: f64) {
+        self.bloom_filter_fpp = Some(value);
+    }
+
     /// Returns optional encoding for this column.
     fn encoding(&self) -> Option<Encoding> {
         self.encoding
@@ -583,6 +632,30 @@ impl ColumnProperties {
     fn max_statistics_size(&self) -> Option<usize> {
         self.max_statistics_size
     }
+
+    /// Returns `Some(true)` if the bloom filter is enabled for this column and
+    /// `Some(false)` if it is disabled; `None` means no setting has been provided.
+    fn bloom_filter_enabled(&self) -> Option<bool> {
+        self.bloom_filter_enabled
+    }
+
+    /// Returns the bloom filter max size in bytes set for this column,
+    /// or `None` if it has not been set.
+    fn bloom_filter_max_bytes(&self) -> Option<u32> {
+        self.bloom_filter_max_bytes
+    }
+
+    /// Returns the bloom filter number of distinct values set for this column,
+    /// or `None` if it has not been set.
+    fn bloom_filter_ndv(&self) -> Option<u64> {
+        self.bloom_filter_ndv
+    }
+
+    /// Returns the bloom filter false positive probability set for this column,
+    /// or `None` if it has not been set.
+    fn bloom_filter_fpp(&self) -> Option<f64> {
+        self.bloom_filter_fpp
+    }
 }
 
 /// Reference counted reader properties.
@@ -685,6 +758,7 @@ mod tests {
             props.max_statistics_size(&ColumnPath::from("col")),
             DEFAULT_MAX_STATISTICS_SIZE
         );
+        assert_eq!(props.bloom_filter_enabled(&ColumnPath::from("col")), false);
     }
 
     #[test]