You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/11/15 11:31:43 UTC

(arrow-rs) branch master updated: Enable truncation of binary statistics columns (#5076)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 7941577d41 Enable truncation of binary statistics columns (#5076)
7941577d41 is described below

commit 7941577d414b9c93d60795bc79125ddad760c252
Author: emcake <37...@users.noreply.github.com>
AuthorDate: Wed Nov 15 11:31:36 2023 +0000

    Enable truncation of binary statistics columns (#5076)
    
    * changes needed to introduce min/max exactness
    
    * implement truncation property and logic, tests
    
    * format lints
    
    * change min/max exact to be with... methods
    
    * reduce code noise
    
    * remove redundant clone
    
    ---------
    
    Co-authored-by: Matthew Kemp <mk...@drwholdings.com>
---
 parquet/src/column/writer/mod.rs     | 228 ++++++++++++++++++++++++++++++++---
 parquet/src/file/properties.rs       |  24 ++++
 parquet/src/file/statistics.rs       | 171 +++++++++++++++++++++-----
 parquet/tests/arrow_writer_layout.rs |  52 ++++----
 4 files changed, 401 insertions(+), 74 deletions(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index a917c48649..11c3968591 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -636,8 +636,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                         Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => {
                             self.column_index_builder.append(
                                 null_page,
-                                self.truncate_min_value(stat.min_bytes()),
-                                self.truncate_max_value(stat.max_bytes()),
+                                self.truncate_min_value(
+                                    self.props.column_index_truncate_length(),
+                                    stat.min_bytes(),
+                                )
+                                .0,
+                                self.truncate_max_value(
+                                    self.props.column_index_truncate_length(),
+                                    stat.max_bytes(),
+                                )
+                                .0,
                                 self.page_metrics.num_page_nulls as i64,
                             );
                         }
@@ -658,26 +666,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
             .append_row_count(self.page_metrics.num_buffered_rows as i64);
     }
 
-    fn truncate_min_value(&self, data: &[u8]) -> Vec<u8> {
-        self.props
-            .column_index_truncate_length()
+    fn truncate_min_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
+        truncation_length
             .filter(|l| data.len() > *l)
             .and_then(|l| match str::from_utf8(data) {
                 Ok(str_data) => truncate_utf8(str_data, l),
                 Err(_) => Some(data[..l].to_vec()),
             })
-            .unwrap_or_else(|| data.to_vec())
+            .map(|truncated| (truncated, true))
+            .unwrap_or_else(|| (data.to_vec(), false))
     }
 
-    fn truncate_max_value(&self, data: &[u8]) -> Vec<u8> {
-        self.props
-            .column_index_truncate_length()
+    fn truncate_max_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
+        truncation_length
             .filter(|l| data.len() > *l)
             .and_then(|l| match str::from_utf8(data) {
                 Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8),
                 Err(_) => increment(data[..l].to_vec()),
             })
-            .unwrap_or_else(|| data.to_vec())
+            .map(|truncated| (truncated, true))
+            .unwrap_or_else(|| (data.to_vec(), false))
     }
 
     /// Adds data page.
@@ -856,20 +864,64 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
             .set_dictionary_page_offset(dict_page_offset);
 
         if self.statistics_enabled != EnabledStatistics::None {
+            let backwards_compatible_min_max = self.descr.sort_order().is_signed();
+
             let statistics = ValueStatistics::<E::T>::new(
                 self.column_metrics.min_column_value.clone(),
                 self.column_metrics.max_column_value.clone(),
                 self.column_metrics.column_distinct_count,
                 self.column_metrics.num_column_nulls,
                 false,
-            );
+            )
+            .with_backwards_compatible_min_max(backwards_compatible_min_max)
+            .into();
+
+            let statistics = match statistics {
+                Statistics::ByteArray(stats) if stats.has_min_max_set() => {
+                    let (min, did_truncate_min) = self.truncate_min_value(
+                        self.props.statistics_truncate_length(),
+                        stats.min_bytes(),
+                    );
+                    let (max, did_truncate_max) = self.truncate_max_value(
+                        self.props.statistics_truncate_length(),
+                        stats.max_bytes(),
+                    );
+                    Statistics::ByteArray(
+                        ValueStatistics::new(
+                            Some(min.into()),
+                            Some(max.into()),
+                            stats.distinct_count(),
+                            stats.null_count(),
+                            backwards_compatible_min_max,
+                        )
+                        .with_max_is_exact(!did_truncate_max)
+                        .with_min_is_exact(!did_truncate_min),
+                    )
+                }
+                Statistics::FixedLenByteArray(stats) if stats.has_min_max_set() => {
+                    let (min, did_truncate_min) = self.truncate_min_value(
+                        self.props.statistics_truncate_length(),
+                        stats.min_bytes(),
+                    );
+                    let (max, did_truncate_max) = self.truncate_max_value(
+                        self.props.statistics_truncate_length(),
+                        stats.max_bytes(),
+                    );
+                    Statistics::FixedLenByteArray(
+                        ValueStatistics::new(
+                            Some(min.into()),
+                            Some(max.into()),
+                            stats.distinct_count(),
+                            stats.null_count(),
+                            backwards_compatible_min_max,
+                        )
+                        .with_max_is_exact(!did_truncate_max)
+                        .with_min_is_exact(!did_truncate_min),
+                    )
+                }
+                stats => stats,
+            };
 
-            // Some common readers only support the deprecated statistics
-            // format so we also write them out if possible
-            // See https://github.com/apache/arrow-rs/issues/799
-            let statistics = statistics
-                .with_backwards_compatible_min_max(self.descr.sort_order().is_signed())
-                .into();
             builder = builder.set_statistics(statistics);
         }
 
@@ -2612,6 +2664,148 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_statistics_truncating_byte_array() {
+        let page_writer = get_test_page_writer();
+
+        const TEST_TRUNCATE_LENGTH: usize = 1;
+
+        // Truncate values at 1 byte
+        let builder =
+            WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH));
+        let props = Arc::new(builder.build());
+        let mut writer = get_test_column_writer::<ByteArrayType>(page_writer, 0, 0, props);
+
+        let mut data = vec![ByteArray::default(); 1];
+        // The only value written: both the min and max statistics derive from it before truncation
+        data[0].set_data(Bytes::from(String::from("Blart Versenwald III")));
+
+        writer.write_batch(&data, None, None).unwrap();
+
+        writer.flush_data_pages().unwrap();
+
+        let r = writer.close().unwrap();
+
+        assert_eq!(1, r.rows_written);
+
+        let stats = r.metadata.statistics().expect("statistics");
+        assert!(stats.has_min_max_set());
+        assert_eq!(stats.null_count(), 0);
+        assert_eq!(stats.distinct_count(), None);
+        if let Statistics::ByteArray(_stats) = stats {
+            let min_value = _stats.min();
+            let max_value = _stats.max();
+
+            assert!(!_stats.min_is_exact());
+            assert!(!_stats.max_is_exact());
+
+            assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
+            assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);
+
+            assert_eq!("B".as_bytes(), min_value.as_bytes());
+            assert_eq!("C".as_bytes(), max_value.as_bytes());
+        } else {
+            panic!("expecting Statistics::ByteArray");
+        }
+    }
+
+    #[test]
+    fn test_statistics_truncating_fixed_len_byte_array() {
+        let page_writer = get_test_page_writer();
+
+        const TEST_TRUNCATE_LENGTH: usize = 1;
+
+        // Truncate values at 1 byte
+        let builder =
+            WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH));
+        let props = Arc::new(builder.build());
+        let mut writer = get_test_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0, props);
+
+        let mut data = vec![FixedLenByteArray::default(); 1];
+
+        const PSEUDO_DECIMAL_VALUE: i128 = 6541894651216648486512564456564654;
+        const PSEUDO_DECIMAL_BYTES: [u8; 16] = PSEUDO_DECIMAL_VALUE.to_be_bytes();
+
+        const EXPECTED_MIN: [u8; TEST_TRUNCATE_LENGTH] = [PSEUDO_DECIMAL_BYTES[0]]; // parquet specifies big-endian order for decimals
+        const EXPECTED_MAX: [u8; TEST_TRUNCATE_LENGTH] =
+            [PSEUDO_DECIMAL_BYTES[0].overflowing_add(1).0];
+
+        // The only value written: both the min and max statistics derive from it before truncation
+        data[0].set_data(Bytes::from(PSEUDO_DECIMAL_BYTES.as_slice()));
+
+        writer.write_batch(&data, None, None).unwrap();
+
+        writer.flush_data_pages().unwrap();
+
+        let r = writer.close().unwrap();
+
+        assert_eq!(1, r.rows_written);
+
+        let stats = r.metadata.statistics().expect("statistics");
+        assert!(stats.has_min_max_set());
+        assert_eq!(stats.null_count(), 0);
+        assert_eq!(stats.distinct_count(), None);
+        if let Statistics::FixedLenByteArray(_stats) = stats {
+            let min_value = _stats.min();
+            let max_value = _stats.max();
+
+            assert!(!_stats.min_is_exact());
+            assert!(!_stats.max_is_exact());
+
+            assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
+            assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);
+
+            assert_eq!(EXPECTED_MIN.as_slice(), min_value.as_bytes());
+            assert_eq!(EXPECTED_MAX.as_slice(), max_value.as_bytes());
+
+            let reconstructed_min = i128::from_be_bytes([
+                min_value.as_bytes()[0],
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            ]);
+
+            let reconstructed_max = i128::from_be_bytes([
+                max_value.as_bytes()[0],
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+            ]);
+
+            // check that the inner value is correctly bounded by the min/max
+            println!("min: {reconstructed_min} {PSEUDO_DECIMAL_VALUE}");
+            assert!(reconstructed_min <= PSEUDO_DECIMAL_VALUE);
+            println!("max {reconstructed_max} {PSEUDO_DECIMAL_VALUE}");
+            assert!(reconstructed_max >= PSEUDO_DECIMAL_VALUE);
+        } else {
+            panic!("expecting Statistics::FixedLenByteArray");
+        }
+    }
+
     #[test]
     fn test_send() {
         fn test<T: Send>() {}
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index ea71763a01..287e73c990 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -51,6 +51,8 @@ pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
 pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
 /// Default value for [`BloomFilterProperties::ndv`]
 pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
+/// Default value for [`WriterProperties::statistics_truncate_length`]
+pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
 
 /// Parquet writer version.
 ///
@@ -136,6 +138,7 @@ pub struct WriterProperties {
     column_properties: HashMap<ColumnPath, ColumnProperties>,
     sorting_columns: Option<Vec<SortingColumn>>,
     column_index_truncate_length: Option<usize>,
+    statistics_truncate_length: Option<usize>,
 }
 
 impl Default for WriterProperties {
@@ -241,6 +244,13 @@ impl WriterProperties {
         self.column_index_truncate_length
     }
 
+    /// Returns the maximum length of truncated min/max values in statistics.
+    ///
+    /// `None` if truncation is disabled, must be greater than 0 otherwise.
+    pub fn statistics_truncate_length(&self) -> Option<usize> {
+        self.statistics_truncate_length
+    }
+
     /// Returns encoding for a data page, when dictionary encoding is enabled.
     /// This is not configurable.
     #[inline]
@@ -334,6 +344,7 @@ pub struct WriterPropertiesBuilder {
     column_properties: HashMap<ColumnPath, ColumnProperties>,
     sorting_columns: Option<Vec<SortingColumn>>,
     column_index_truncate_length: Option<usize>,
+    statistics_truncate_length: Option<usize>,
 }
 
 impl WriterPropertiesBuilder {
@@ -352,6 +363,7 @@ impl WriterPropertiesBuilder {
             column_properties: HashMap::new(),
             sorting_columns: None,
             column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
+            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
         }
     }
 
@@ -370,6 +382,7 @@ impl WriterPropertiesBuilder {
             column_properties: self.column_properties,
             sorting_columns: self.sorting_columns,
             column_index_truncate_length: self.column_index_truncate_length,
+            statistics_truncate_length: self.statistics_truncate_length,
         }
     }
 
@@ -643,6 +656,17 @@ impl WriterPropertiesBuilder {
         self.column_index_truncate_length = max_length;
         self
     }
+
+    /// Sets the max length of min/max value fields in statistics. Must be greater than 0.
+    /// If set to `None` - there's no effective limit.
+    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
+        if let Some(value) = max_length {
+            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
+        }
+
+        self.statistics_truncate_length = max_length;
+        self
+    }
 }
 
 /// Controls the level of statistics to be computed by the writer
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index 345fe7dd26..1bc003d488 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -27,6 +27,8 @@
 //! assert_eq!(stats.null_count(), 3);
 //! assert!(stats.has_min_max_set());
 //! assert!(stats.is_min_max_deprecated());
+//! assert!(stats.min_is_exact());
+//! assert!(stats.max_is_exact());
 //!
 //! match stats {
 //!     Statistics::Int32(ref typed) => {
@@ -206,19 +208,27 @@ pub fn from_thrift(
                     null_count,
                     old_format,
                 ),
-                Type::BYTE_ARRAY => Statistics::byte_array(
-                    min.map(ByteArray::from),
-                    max.map(ByteArray::from),
-                    distinct_count,
-                    null_count,
-                    old_format,
+                Type::BYTE_ARRAY => Statistics::ByteArray(
+                    ValueStatistics::new(
+                        min.map(ByteArray::from),
+                        max.map(ByteArray::from),
+                        distinct_count,
+                        null_count,
+                        old_format,
+                    )
+                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
+                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
                 ),
-                Type::FIXED_LEN_BYTE_ARRAY => Statistics::fixed_len_byte_array(
-                    min.map(ByteArray::from).map(FixedLenByteArray::from),
-                    max.map(ByteArray::from).map(FixedLenByteArray::from),
-                    distinct_count,
-                    null_count,
-                    old_format,
+                Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
+                    ValueStatistics::new(
+                        min.map(ByteArray::from).map(FixedLenByteArray::from),
+                        max.map(ByteArray::from).map(FixedLenByteArray::from),
+                        distinct_count,
+                        null_count,
+                        old_format,
+                    )
+                    .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
+                    .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
                 ),
             };
 
@@ -248,13 +258,15 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
     };
 
     // Get min/max if set.
-    let (min, max) = if stats.has_min_max_set() {
+    let (min, max, min_exact, max_exact) = if stats.has_min_max_set() {
         (
             Some(stats.min_bytes().to_vec()),
             Some(stats.max_bytes().to_vec()),
+            Some(stats.min_is_exact()),
+            Some(stats.max_is_exact()),
         )
     } else {
-        (None, None)
+        (None, None, None, None)
     };
 
     if stats.is_min_max_backwards_compatible() {
@@ -268,6 +280,9 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
         thrift_stats.max_value = max;
     }
 
+    thrift_stats.is_min_value_exact = min_exact;
+    thrift_stats.is_max_value_exact = max_exact;
+
     Some(thrift_stats)
 }
 
@@ -374,6 +389,16 @@ impl Statistics {
         statistics_enum_func![self, has_min_max_set]
     }
 
+    /// Returns `true` if the min value is set, and is an exact min value.
+    pub fn min_is_exact(&self) -> bool {
+        statistics_enum_func![self, min_is_exact]
+    }
+
+    /// Returns `true` if the max value is set, and is an exact max value.
+    pub fn max_is_exact(&self) -> bool {
+        statistics_enum_func![self, max_is_exact]
+    }
+
     /// Returns slice of bytes that represent min value.
     /// Panics if min value is not set.
     pub fn min_bytes(&self) -> &[u8] {
@@ -428,6 +453,10 @@ pub struct ValueStatistics<T> {
     distinct_count: Option<u64>,
     null_count: u64,
 
+    // Whether or not the min or max values are exact, or truncated.
+    is_max_value_exact: bool,
+    is_min_value_exact: bool,
+
     /// If `true` populate the deprecated `min` and `max` fields instead of
     /// `min_value` and `max_value`
     is_min_max_deprecated: bool,
@@ -447,6 +476,8 @@ impl<T: ParquetValueType> ValueStatistics<T> {
         is_min_max_deprecated: bool,
     ) -> Self {
         Self {
+            is_max_value_exact: max.is_some(),
+            is_min_value_exact: min.is_some(),
             min,
             max,
             distinct_count,
@@ -456,6 +487,28 @@ impl<T: ParquetValueType> ValueStatistics<T> {
         }
     }
 
+    /// Set whether the stored `min` field represents the exact
+    /// minimum, or just a bound on the minimum value.
+    ///
+    /// see [`Self::min_is_exact`]
+    pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
+        Self {
+            is_min_value_exact,
+            ..self
+        }
+    }
+
+    /// Set whether the stored `max` field represents the exact
+    /// maximum, or just a bound on the maximum value.
+    ///
+    /// see [`Self::max_is_exact`]
+    pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
+        Self {
+            is_max_value_exact,
+            ..self
+        }
+    }
+
     /// Set whether to write the deprecated `min` and `max` fields
     /// for compatibility with older parquet writers
     ///
@@ -506,13 +559,23 @@ impl<T: ParquetValueType> ValueStatistics<T> {
         self.min.is_some() && self.max.is_some()
     }
 
+    /// Whether or not max value is set, and is an exact value.
+    pub fn max_is_exact(&self) -> bool {
+        self.max.is_some() && self.is_max_value_exact
+    }
+
+    /// Whether or not min value is set, and is an exact value.
+    pub fn min_is_exact(&self) -> bool {
+        self.min.is_some() && self.is_min_value_exact
+    }
+
     /// Returns optional value of number of distinct values occurring.
-    fn distinct_count(&self) -> Option<u64> {
+    pub fn distinct_count(&self) -> Option<u64> {
         self.distinct_count
     }
 
     /// Returns null count.
-    fn null_count(&self) -> u64 {
+    pub fn null_count(&self) -> u64 {
         self.null_count
     }
 
@@ -556,6 +619,8 @@ impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
         }
         write!(f, ", null_count: {}", self.null_count)?;
         write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
+        write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
+        write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
         write!(f, "}}")
     }
 }
@@ -565,13 +630,15 @@ impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
         write!(
             f,
             "{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \
-             min_max_deprecated: {}, min_max_backwards_compatible: {}}}",
+             min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
             self.min,
             self.max,
             self.distinct_count,
             self.null_count,
             self.is_min_max_deprecated,
-            self.is_min_max_backwards_compatible
+            self.is_min_max_backwards_compatible,
+            self.is_max_value_exact,
+            self.is_min_value_exact
         )
     }
 }
@@ -628,14 +695,14 @@ mod tests {
         assert_eq!(
             format!("{stats:?}"),
             "Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \
-             min_max_deprecated: true, min_max_backwards_compatible: true})"
+             min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
         );
 
         let stats = Statistics::int32(None, None, None, 7, false);
         assert_eq!(
             format!("{stats:?}"),
             "Int32({min: None, max: None, distinct_count: None, null_count: 7, \
-             min_max_deprecated: false, min_max_backwards_compatible: false})"
+             min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
         )
     }
 
@@ -644,14 +711,14 @@ mod tests {
         let stats = Statistics::int32(Some(1), Some(12), None, 12, true);
         assert_eq!(
             format!("{stats}"),
-            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true}"
+            "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
         );
 
         let stats = Statistics::int64(None, None, None, 7, false);
         assert_eq!(
             format!("{stats}"),
             "{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
-             false}"
+             false, max_value_exact: false, min_value_exact: false}"
         );
 
         let stats = Statistics::int96(
@@ -664,19 +731,23 @@ mod tests {
         assert_eq!(
             format!("{stats}"),
             "{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
-             min_max_deprecated: true}"
+             min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
         );
 
-        let stats = Statistics::byte_array(
-            Some(ByteArray::from(vec![1u8])),
-            Some(ByteArray::from(vec![2u8])),
-            Some(5),
-            7,
-            false,
+        let stats = Statistics::ByteArray(
+            ValueStatistics::new(
+                Some(ByteArray::from(vec![1u8])),
+                Some(ByteArray::from(vec![2u8])),
+                Some(5),
+                7,
+                false,
+            )
+            .with_max_is_exact(false)
+            .with_min_is_exact(false),
         );
         assert_eq!(
             format!("{stats}"),
-            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false}"
+            "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
         );
     }
 
@@ -712,7 +783,45 @@ mod tests {
                 Some(ByteArray::from(vec![1, 2, 3]).into()),
                 None,
                 0,
-                true
+                true,
+            )
+        );
+
+        assert!(
+            Statistics::byte_array(
+                Some(ByteArray::from(vec![1, 2, 3])),
+                Some(ByteArray::from(vec![1, 2, 3])),
+                None,
+                0,
+                true,
+            ) != Statistics::ByteArray(
+                ValueStatistics::new(
+                    Some(ByteArray::from(vec![1, 2, 3])),
+                    Some(ByteArray::from(vec![1, 2, 3])),
+                    None,
+                    0,
+                    true,
+                )
+                .with_max_is_exact(false)
+            )
+        );
+
+        assert!(
+            Statistics::fixed_len_byte_array(
+                Some(FixedLenByteArray::from(vec![1, 2, 3])),
+                Some(FixedLenByteArray::from(vec![1, 2, 3])),
+                None,
+                0,
+                true,
+            ) != Statistics::FixedLenByteArray(
+                ValueStatistics::new(
+                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
+                    Some(FixedLenByteArray::from(vec![1, 2, 3])),
+                    None,
+                    0,
+                    true,
+                )
+                .with_min_is_exact(false)
             )
         );
     }
diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs
index fab87f32f5..cd124031cf 100644
--- a/parquet/tests/arrow_writer_layout.rs
+++ b/parquet/tests/arrow_writer_layout.rs
@@ -185,7 +185,7 @@ fn test_primitive() {
                     pages: (0..8)
                         .map(|_| Page {
                             rows: 250,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 1000,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,
@@ -214,14 +214,14 @@ fn test_primitive() {
                     pages: vec![
                         Page {
                             rows: 250,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 258,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 1750,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 7000,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,
@@ -229,7 +229,7 @@ fn test_primitive() {
                     ],
                     dictionary_page: Some(Page {
                         rows: 250,
-                        page_header_size: 34,
+                        page_header_size: 36,
                         compressed_size: 1000,
                         encoding: Encoding::PLAIN,
                         page_type: PageType::DICTIONARY_PAGE,
@@ -256,42 +256,42 @@ fn test_primitive() {
                     pages: vec![
                         Page {
                             rows: 400,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 452,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 370,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 472,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 330,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 464,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 330,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 464,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 330,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 464,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 240,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 332,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
@@ -299,7 +299,7 @@ fn test_primitive() {
                     ],
                     dictionary_page: Some(Page {
                         rows: 2000,
-                        page_header_size: 34,
+                        page_header_size: 36,
                         compressed_size: 8000,
                         encoding: Encoding::PLAIN,
                         page_type: PageType::DICTIONARY_PAGE,
@@ -325,7 +325,7 @@ fn test_primitive() {
                     pages: (0..20)
                         .map(|_| Page {
                             rows: 100,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 400,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,
@@ -360,14 +360,14 @@ fn test_string() {
                     pages: (0..15)
                         .map(|_| Page {
                             rows: 130,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 1040,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,
                         })
                         .chain(std::iter::once(Page {
                             rows: 50,
-                            page_header_size: 33,
+                            page_header_size: 35,
                             compressed_size: 400,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,
@@ -396,21 +396,21 @@ fn test_string() {
                     pages: vec![
                         Page {
                             rows: 130,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 138,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 1250,
-                            page_header_size: 36,
+                            page_header_size: 38,
                             compressed_size: 10000,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 620,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 4960,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,
@@ -418,7 +418,7 @@ fn test_string() {
                     ],
                     dictionary_page: Some(Page {
                         rows: 130,
-                        page_header_size: 34,
+                        page_header_size: 36,
                         compressed_size: 1040,
                         encoding: Encoding::PLAIN,
                         page_type: PageType::DICTIONARY_PAGE,
@@ -445,42 +445,42 @@ fn test_string() {
                     pages: vec![
                         Page {
                             rows: 400,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 452,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 370,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 472,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 330,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 464,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 330,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 464,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 330,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 464,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
                         },
                         Page {
                             rows: 240,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 332,
                             encoding: Encoding::RLE_DICTIONARY,
                             page_type: PageType::DATA_PAGE,
@@ -488,7 +488,7 @@ fn test_string() {
                     ],
                     dictionary_page: Some(Page {
                         rows: 2000,
-                        page_header_size: 34,
+                        page_header_size: 36,
                         compressed_size: 16000,
                         encoding: Encoding::PLAIN,
                         page_type: PageType::DICTIONARY_PAGE,
@@ -528,7 +528,7 @@ fn test_list() {
                     pages: (0..10)
                         .map(|_| Page {
                             rows: 20,
-                            page_header_size: 34,
+                            page_header_size: 36,
                             compressed_size: 672,
                             encoding: Encoding::PLAIN,
                             page_type: PageType::DATA_PAGE,