You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/11/15 11:31:43 UTC
(arrow-rs) branch master updated: Enable truncation of binary statistics columns (#5076)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7941577d41 Enable truncation of binary statistics columns (#5076)
7941577d41 is described below
commit 7941577d414b9c93d60795bc79125ddad760c252
Author: emcake <37...@users.noreply.github.com>
AuthorDate: Wed Nov 15 11:31:36 2023 +0000
Enable truncation of binary statistics columns (#5076)
* changes needed to introduce min/max exactness
* implement truncation property and logic, tests
* format lints
* change min/max exact to be with... methods
* reduce code noise
* remove redundant clone
---------
Co-authored-by: Matthew Kemp <mk...@drwholdings.com>
---
parquet/src/column/writer/mod.rs | 228 ++++++++++++++++++++++++++++++++---
parquet/src/file/properties.rs | 24 ++++
parquet/src/file/statistics.rs | 171 +++++++++++++++++++++-----
parquet/tests/arrow_writer_layout.rs | 52 ++++----
4 files changed, 401 insertions(+), 74 deletions(-)
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index a917c48649..11c3968591 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -636,8 +636,16 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => {
self.column_index_builder.append(
null_page,
- self.truncate_min_value(stat.min_bytes()),
- self.truncate_max_value(stat.max_bytes()),
+ self.truncate_min_value(
+ self.props.column_index_truncate_length(),
+ stat.min_bytes(),
+ )
+ .0,
+ self.truncate_max_value(
+ self.props.column_index_truncate_length(),
+ stat.max_bytes(),
+ )
+ .0,
self.page_metrics.num_page_nulls as i64,
);
}
@@ -658,26 +666,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
.append_row_count(self.page_metrics.num_buffered_rows as i64);
}
- fn truncate_min_value(&self, data: &[u8]) -> Vec<u8> {
- self.props
- .column_index_truncate_length()
+ fn truncate_min_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
+ truncation_length
.filter(|l| data.len() > *l)
.and_then(|l| match str::from_utf8(data) {
Ok(str_data) => truncate_utf8(str_data, l),
Err(_) => Some(data[..l].to_vec()),
})
- .unwrap_or_else(|| data.to_vec())
+ .map(|truncated| (truncated, true))
+ .unwrap_or_else(|| (data.to_vec(), false))
}
- fn truncate_max_value(&self, data: &[u8]) -> Vec<u8> {
- self.props
- .column_index_truncate_length()
+ fn truncate_max_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
+ truncation_length
.filter(|l| data.len() > *l)
.and_then(|l| match str::from_utf8(data) {
Ok(str_data) => truncate_utf8(str_data, l).and_then(increment_utf8),
Err(_) => increment(data[..l].to_vec()),
})
- .unwrap_or_else(|| data.to_vec())
+ .map(|truncated| (truncated, true))
+ .unwrap_or_else(|| (data.to_vec(), false))
}
/// Adds data page.
@@ -856,20 +864,64 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
.set_dictionary_page_offset(dict_page_offset);
if self.statistics_enabled != EnabledStatistics::None {
+ let backwards_compatible_min_max = self.descr.sort_order().is_signed();
+
let statistics = ValueStatistics::<E::T>::new(
self.column_metrics.min_column_value.clone(),
self.column_metrics.max_column_value.clone(),
self.column_metrics.column_distinct_count,
self.column_metrics.num_column_nulls,
false,
- );
+ )
+ .with_backwards_compatible_min_max(backwards_compatible_min_max)
+ .into();
+
+ let statistics = match statistics {
+ Statistics::ByteArray(stats) if stats.has_min_max_set() => {
+ let (min, did_truncate_min) = self.truncate_min_value(
+ self.props.statistics_truncate_length(),
+ stats.min_bytes(),
+ );
+ let (max, did_truncate_max) = self.truncate_max_value(
+ self.props.statistics_truncate_length(),
+ stats.max_bytes(),
+ );
+ Statistics::ByteArray(
+ ValueStatistics::new(
+ Some(min.into()),
+ Some(max.into()),
+ stats.distinct_count(),
+ stats.null_count(),
+ backwards_compatible_min_max,
+ )
+ .with_max_is_exact(!did_truncate_max)
+ .with_min_is_exact(!did_truncate_min),
+ )
+ }
+ Statistics::FixedLenByteArray(stats) if stats.has_min_max_set() => {
+ let (min, did_truncate_min) = self.truncate_min_value(
+ self.props.statistics_truncate_length(),
+ stats.min_bytes(),
+ );
+ let (max, did_truncate_max) = self.truncate_max_value(
+ self.props.statistics_truncate_length(),
+ stats.max_bytes(),
+ );
+ Statistics::FixedLenByteArray(
+ ValueStatistics::new(
+ Some(min.into()),
+ Some(max.into()),
+ stats.distinct_count(),
+ stats.null_count(),
+ backwards_compatible_min_max,
+ )
+ .with_max_is_exact(!did_truncate_max)
+ .with_min_is_exact(!did_truncate_min),
+ )
+ }
+ stats => stats,
+ };
- // Some common readers only support the deprecated statistics
- // format so we also write them out if possible
- // See https://github.com/apache/arrow-rs/issues/799
- let statistics = statistics
- .with_backwards_compatible_min_max(self.descr.sort_order().is_signed())
- .into();
builder = builder.set_statistics(statistics);
}
@@ -2612,6 +2664,148 @@ mod tests {
}
}
+ #[test]
+ fn test_statistics_truncating_byte_array() {
+ let page_writer = get_test_page_writer();
+
+ const TEST_TRUNCATE_LENGTH: usize = 1;
+
+ // Truncate values at 1 byte
+ let builder =
+ WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH));
+ let props = Arc::new(builder.build());
+ let mut writer = get_test_column_writer::<ByteArrayType>(page_writer, 0, 0, props);
+
+ let mut data = vec![ByteArray::default(); 1];
+ // This is the expected min value
+ data[0].set_data(Bytes::from(String::from("Blart Versenwald III")));
+
+ writer.write_batch(&data, None, None).unwrap();
+
+ writer.flush_data_pages().unwrap();
+
+ let r = writer.close().unwrap();
+
+ assert_eq!(1, r.rows_written);
+
+ let stats = r.metadata.statistics().expect("statistics");
+ assert!(stats.has_min_max_set());
+ assert_eq!(stats.null_count(), 0);
+ assert_eq!(stats.distinct_count(), None);
+ if let Statistics::ByteArray(_stats) = stats {
+ let min_value = _stats.min();
+ let max_value = _stats.max();
+
+ assert!(!_stats.min_is_exact());
+ assert!(!_stats.max_is_exact());
+
+ assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
+ assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);
+
+ assert_eq!("B".as_bytes(), min_value.as_bytes());
+ assert_eq!("C".as_bytes(), max_value.as_bytes());
+ } else {
+ panic!("expecting Statistics::ByteArray");
+ }
+ }
+
+ #[test]
+ fn test_statistics_truncating_fixed_len_byte_array() {
+ let page_writer = get_test_page_writer();
+
+ const TEST_TRUNCATE_LENGTH: usize = 1;
+
+ // Truncate values at 1 byte
+ let builder =
+ WriterProperties::builder().set_statistics_truncate_length(Some(TEST_TRUNCATE_LENGTH));
+ let props = Arc::new(builder.build());
+ let mut writer = get_test_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0, props);
+
+ let mut data = vec![FixedLenByteArray::default(); 1];
+
+ const PSEUDO_DECIMAL_VALUE: i128 = 6541894651216648486512564456564654;
+ const PSEUDO_DECIMAL_BYTES: [u8; 16] = PSEUDO_DECIMAL_VALUE.to_be_bytes();
+
+ const EXPECTED_MIN: [u8; TEST_TRUNCATE_LENGTH] = [PSEUDO_DECIMAL_BYTES[0]]; // parquet specifies big-endian order for decimals
+ const EXPECTED_MAX: [u8; TEST_TRUNCATE_LENGTH] =
+ [PSEUDO_DECIMAL_BYTES[0].overflowing_add(1).0];
+
+ // This is the expected min value
+ data[0].set_data(Bytes::from(PSEUDO_DECIMAL_BYTES.as_slice()));
+
+ writer.write_batch(&data, None, None).unwrap();
+
+ writer.flush_data_pages().unwrap();
+
+ let r = writer.close().unwrap();
+
+ assert_eq!(1, r.rows_written);
+
+ let stats = r.metadata.statistics().expect("statistics");
+ assert!(stats.has_min_max_set());
+ assert_eq!(stats.null_count(), 0);
+ assert_eq!(stats.distinct_count(), None);
+ if let Statistics::FixedLenByteArray(_stats) = stats {
+ let min_value = _stats.min();
+ let max_value = _stats.max();
+
+ assert!(!_stats.min_is_exact());
+ assert!(!_stats.max_is_exact());
+
+ assert_eq!(min_value.len(), TEST_TRUNCATE_LENGTH);
+ assert_eq!(max_value.len(), TEST_TRUNCATE_LENGTH);
+
+ assert_eq!(EXPECTED_MIN.as_slice(), min_value.as_bytes());
+ assert_eq!(EXPECTED_MAX.as_slice(), max_value.as_bytes());
+
+ let reconstructed_min = i128::from_be_bytes([
+ min_value.as_bytes()[0],
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ ]);
+
+ let reconstructed_max = i128::from_be_bytes([
+ max_value.as_bytes()[0],
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ ]);
+
+ // check that the inner value is correctly bounded by the min/max
+ println!("min: {reconstructed_min} {PSEUDO_DECIMAL_VALUE}");
+ assert!(reconstructed_min <= PSEUDO_DECIMAL_VALUE);
+ println!("max {reconstructed_max} {PSEUDO_DECIMAL_VALUE}");
+ assert!(reconstructed_max >= PSEUDO_DECIMAL_VALUE);
+ } else {
+ panic!("expecting Statistics::FixedLenByteArray");
+ }
+ }
+
#[test]
fn test_send() {
fn test<T: Send>() {}
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index ea71763a01..287e73c990 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -51,6 +51,8 @@ pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
+/// Default value for [`WriterProperties::statistics_truncate_length`]
+pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
/// Parquet writer version.
///
@@ -136,6 +138,7 @@ pub struct WriterProperties {
column_properties: HashMap<ColumnPath, ColumnProperties>,
sorting_columns: Option<Vec<SortingColumn>>,
column_index_truncate_length: Option<usize>,
+ statistics_truncate_length: Option<usize>,
}
impl Default for WriterProperties {
@@ -241,6 +244,13 @@ impl WriterProperties {
self.column_index_truncate_length
}
+ /// Returns the maximum length of truncated min/max values in statistics.
+ ///
+ /// `None` if truncation is disabled, must be greater than 0 otherwise.
+ pub fn statistics_truncate_length(&self) -> Option<usize> {
+ self.statistics_truncate_length
+ }
+
/// Returns encoding for a data page, when dictionary encoding is enabled.
/// This is not configurable.
#[inline]
@@ -334,6 +344,7 @@ pub struct WriterPropertiesBuilder {
column_properties: HashMap<ColumnPath, ColumnProperties>,
sorting_columns: Option<Vec<SortingColumn>>,
column_index_truncate_length: Option<usize>,
+ statistics_truncate_length: Option<usize>,
}
impl WriterPropertiesBuilder {
@@ -352,6 +363,7 @@ impl WriterPropertiesBuilder {
column_properties: HashMap::new(),
sorting_columns: None,
column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
+ statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
}
}
@@ -370,6 +382,7 @@ impl WriterPropertiesBuilder {
column_properties: self.column_properties,
sorting_columns: self.sorting_columns,
column_index_truncate_length: self.column_index_truncate_length,
+ statistics_truncate_length: self.statistics_truncate_length,
}
}
@@ -643,6 +656,17 @@ impl WriterPropertiesBuilder {
self.column_index_truncate_length = max_length;
self
}
+
+ /// Sets the max length of min/max value fields in statistics. Must be greater than 0.
+ /// If set to `None` - there's no effective limit.
+ pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
+ if let Some(value) = max_length {
+ assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
+ }
+
+ self.statistics_truncate_length = max_length;
+ self
+ }
}
/// Controls the level of statistics to be computed by the writer
diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs
index 345fe7dd26..1bc003d488 100644
--- a/parquet/src/file/statistics.rs
+++ b/parquet/src/file/statistics.rs
@@ -27,6 +27,8 @@
//! assert_eq!(stats.null_count(), 3);
//! assert!(stats.has_min_max_set());
//! assert!(stats.is_min_max_deprecated());
+//! assert!(stats.min_is_exact());
+//! assert!(stats.max_is_exact());
//!
//! match stats {
//! Statistics::Int32(ref typed) => {
@@ -206,19 +208,27 @@ pub fn from_thrift(
null_count,
old_format,
),
- Type::BYTE_ARRAY => Statistics::byte_array(
- min.map(ByteArray::from),
- max.map(ByteArray::from),
- distinct_count,
- null_count,
- old_format,
+ Type::BYTE_ARRAY => Statistics::ByteArray(
+ ValueStatistics::new(
+ min.map(ByteArray::from),
+ max.map(ByteArray::from),
+ distinct_count,
+ null_count,
+ old_format,
+ )
+ .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
+ .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
),
- Type::FIXED_LEN_BYTE_ARRAY => Statistics::fixed_len_byte_array(
- min.map(ByteArray::from).map(FixedLenByteArray::from),
- max.map(ByteArray::from).map(FixedLenByteArray::from),
- distinct_count,
- null_count,
- old_format,
+ Type::FIXED_LEN_BYTE_ARRAY => Statistics::FixedLenByteArray(
+ ValueStatistics::new(
+ min.map(ByteArray::from).map(FixedLenByteArray::from),
+ max.map(ByteArray::from).map(FixedLenByteArray::from),
+ distinct_count,
+ null_count,
+ old_format,
+ )
+ .with_max_is_exact(stats.is_max_value_exact.unwrap_or(false))
+ .with_min_is_exact(stats.is_min_value_exact.unwrap_or(false)),
),
};
@@ -248,13 +258,15 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
};
// Get min/max if set.
- let (min, max) = if stats.has_min_max_set() {
+ let (min, max, min_exact, max_exact) = if stats.has_min_max_set() {
(
Some(stats.min_bytes().to_vec()),
Some(stats.max_bytes().to_vec()),
+ Some(stats.min_is_exact()),
+ Some(stats.max_is_exact()),
)
} else {
- (None, None)
+ (None, None, None, None)
};
if stats.is_min_max_backwards_compatible() {
@@ -268,6 +280,9 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
thrift_stats.max_value = max;
}
+ thrift_stats.is_min_value_exact = min_exact;
+ thrift_stats.is_max_value_exact = max_exact;
+
Some(thrift_stats)
}
@@ -374,6 +389,16 @@ impl Statistics {
statistics_enum_func![self, has_min_max_set]
}
+ /// Returns `true` if the min value is set, and is an exact min value.
+ pub fn min_is_exact(&self) -> bool {
+ statistics_enum_func![self, min_is_exact]
+ }
+
+ /// Returns `true` if the max value is set, and is an exact max value.
+ pub fn max_is_exact(&self) -> bool {
+ statistics_enum_func![self, max_is_exact]
+ }
+
/// Returns slice of bytes that represent min value.
/// Panics if min value is not set.
pub fn min_bytes(&self) -> &[u8] {
@@ -428,6 +453,10 @@ pub struct ValueStatistics<T> {
distinct_count: Option<u64>,
null_count: u64,
+ // Whether or not the min or max values are exact, or truncated.
+ is_max_value_exact: bool,
+ is_min_value_exact: bool,
+
/// If `true` populate the deprecated `min` and `max` fields instead of
/// `min_value` and `max_value`
is_min_max_deprecated: bool,
@@ -447,6 +476,8 @@ impl<T: ParquetValueType> ValueStatistics<T> {
is_min_max_deprecated: bool,
) -> Self {
Self {
+ is_max_value_exact: max.is_some(),
+ is_min_value_exact: min.is_some(),
min,
max,
distinct_count,
@@ -456,6 +487,28 @@ impl<T: ParquetValueType> ValueStatistics<T> {
}
}
+ /// Set whether the stored `min` field represents the exact
+ /// minimum, or just a bound on the minimum value.
+ ///
+ /// see [`Self::min_is_exact`]
+ pub fn with_min_is_exact(self, is_min_value_exact: bool) -> Self {
+ Self {
+ is_min_value_exact,
+ ..self
+ }
+ }
+
+ /// Set whether the stored `max` field represents the exact
+ /// maximum, or just a bound on the maximum value.
+ ///
+ /// see [`Self::max_is_exact`]
+ pub fn with_max_is_exact(self, is_max_value_exact: bool) -> Self {
+ Self {
+ is_max_value_exact,
+ ..self
+ }
+ }
+
/// Set whether to write the deprecated `min` and `max` fields
/// for compatibility with older parquet writers
///
@@ -506,13 +559,23 @@ impl<T: ParquetValueType> ValueStatistics<T> {
self.min.is_some() && self.max.is_some()
}
+ /// Whether or not max value is set, and is an exact value.
+ pub fn max_is_exact(&self) -> bool {
+ self.max.is_some() && self.is_max_value_exact
+ }
+
+ /// Whether or not min value is set, and is an exact value.
+ pub fn min_is_exact(&self) -> bool {
+ self.min.is_some() && self.is_min_value_exact
+ }
+
/// Returns optional value of number of distinct values occurring.
- fn distinct_count(&self) -> Option<u64> {
+ pub fn distinct_count(&self) -> Option<u64> {
self.distinct_count
}
/// Returns null count.
- fn null_count(&self) -> u64 {
+ pub fn null_count(&self) -> u64 {
self.null_count
}
@@ -556,6 +619,8 @@ impl<T: ParquetValueType> fmt::Display for ValueStatistics<T> {
}
write!(f, ", null_count: {}", self.null_count)?;
write!(f, ", min_max_deprecated: {}", self.is_min_max_deprecated)?;
+ write!(f, ", max_value_exact: {}", self.is_max_value_exact)?;
+ write!(f, ", min_value_exact: {}", self.is_min_value_exact)?;
write!(f, "}}")
}
}
@@ -565,13 +630,15 @@ impl<T: ParquetValueType> fmt::Debug for ValueStatistics<T> {
write!(
f,
"{{min: {:?}, max: {:?}, distinct_count: {:?}, null_count: {}, \
- min_max_deprecated: {}, min_max_backwards_compatible: {}}}",
+ min_max_deprecated: {}, min_max_backwards_compatible: {}, max_value_exact: {}, min_value_exact: {}}}",
self.min,
self.max,
self.distinct_count,
self.null_count,
self.is_min_max_deprecated,
- self.is_min_max_backwards_compatible
+ self.is_min_max_backwards_compatible,
+ self.is_max_value_exact,
+ self.is_min_value_exact
)
}
}
@@ -628,14 +695,14 @@ mod tests {
assert_eq!(
format!("{stats:?}"),
"Int32({min: Some(1), max: Some(12), distinct_count: None, null_count: 12, \
- min_max_deprecated: true, min_max_backwards_compatible: true})"
+ min_max_deprecated: true, min_max_backwards_compatible: true, max_value_exact: true, min_value_exact: true})"
);
let stats = Statistics::int32(None, None, None, 7, false);
assert_eq!(
format!("{stats:?}"),
"Int32({min: None, max: None, distinct_count: None, null_count: 7, \
- min_max_deprecated: false, min_max_backwards_compatible: false})"
+ min_max_deprecated: false, min_max_backwards_compatible: false, max_value_exact: false, min_value_exact: false})"
)
}
@@ -644,14 +711,14 @@ mod tests {
let stats = Statistics::int32(Some(1), Some(12), None, 12, true);
assert_eq!(
format!("{stats}"),
- "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true}"
+ "{min: 1, max: 12, distinct_count: N/A, null_count: 12, min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
);
let stats = Statistics::int64(None, None, None, 7, false);
assert_eq!(
format!("{stats}"),
"{min: N/A, max: N/A, distinct_count: N/A, null_count: 7, min_max_deprecated: \
- false}"
+ false, max_value_exact: false, min_value_exact: false}"
);
let stats = Statistics::int96(
@@ -664,19 +731,23 @@ mod tests {
assert_eq!(
format!("{stats}"),
"{min: [1, 0, 0], max: [2, 3, 4], distinct_count: N/A, null_count: 3, \
- min_max_deprecated: true}"
+ min_max_deprecated: true, max_value_exact: true, min_value_exact: true}"
);
- let stats = Statistics::byte_array(
- Some(ByteArray::from(vec![1u8])),
- Some(ByteArray::from(vec![2u8])),
- Some(5),
- 7,
- false,
+ let stats = Statistics::ByteArray(
+ ValueStatistics::new(
+ Some(ByteArray::from(vec![1u8])),
+ Some(ByteArray::from(vec![2u8])),
+ Some(5),
+ 7,
+ false,
+ )
+ .with_max_is_exact(false)
+ .with_min_is_exact(false),
);
assert_eq!(
format!("{stats}"),
- "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false}"
+ "{min: [1], max: [2], distinct_count: 5, null_count: 7, min_max_deprecated: false, max_value_exact: false, min_value_exact: false}"
);
}
@@ -712,7 +783,45 @@ mod tests {
Some(ByteArray::from(vec![1, 2, 3]).into()),
None,
0,
- true
+ true,
+ )
+ );
+
+ assert!(
+ Statistics::byte_array(
+ Some(ByteArray::from(vec![1, 2, 3])),
+ Some(ByteArray::from(vec![1, 2, 3])),
+ None,
+ 0,
+ true,
+ ) != Statistics::ByteArray(
+ ValueStatistics::new(
+ Some(ByteArray::from(vec![1, 2, 3])),
+ Some(ByteArray::from(vec![1, 2, 3])),
+ None,
+ 0,
+ true,
+ )
+ .with_max_is_exact(false)
+ )
+ );
+
+ assert!(
+ Statistics::fixed_len_byte_array(
+ Some(FixedLenByteArray::from(vec![1, 2, 3])),
+ Some(FixedLenByteArray::from(vec![1, 2, 3])),
+ None,
+ 0,
+ true,
+ ) != Statistics::FixedLenByteArray(
+ ValueStatistics::new(
+ Some(FixedLenByteArray::from(vec![1, 2, 3])),
+ Some(FixedLenByteArray::from(vec![1, 2, 3])),
+ None,
+ 0,
+ true,
+ )
+ .with_min_is_exact(false)
)
);
}
diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs
index fab87f32f5..cd124031cf 100644
--- a/parquet/tests/arrow_writer_layout.rs
+++ b/parquet/tests/arrow_writer_layout.rs
@@ -185,7 +185,7 @@ fn test_primitive() {
pages: (0..8)
.map(|_| Page {
rows: 250,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 1000,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
@@ -214,14 +214,14 @@ fn test_primitive() {
pages: vec![
Page {
rows: 250,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 258,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 1750,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 7000,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
@@ -229,7 +229,7 @@ fn test_primitive() {
],
dictionary_page: Some(Page {
rows: 250,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 1000,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
@@ -256,42 +256,42 @@ fn test_primitive() {
pages: vec![
Page {
rows: 400,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 452,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 370,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 472,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 240,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 332,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
@@ -299,7 +299,7 @@ fn test_primitive() {
],
dictionary_page: Some(Page {
rows: 2000,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 8000,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
@@ -325,7 +325,7 @@ fn test_primitive() {
pages: (0..20)
.map(|_| Page {
rows: 100,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 400,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
@@ -360,14 +360,14 @@ fn test_string() {
pages: (0..15)
.map(|_| Page {
rows: 130,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 1040,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
})
.chain(std::iter::once(Page {
rows: 50,
- page_header_size: 33,
+ page_header_size: 35,
compressed_size: 400,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
@@ -396,21 +396,21 @@ fn test_string() {
pages: vec![
Page {
rows: 130,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 138,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 1250,
- page_header_size: 36,
+ page_header_size: 38,
compressed_size: 10000,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 620,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 4960,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
@@ -418,7 +418,7 @@ fn test_string() {
],
dictionary_page: Some(Page {
rows: 130,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 1040,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
@@ -445,42 +445,42 @@ fn test_string() {
pages: vec![
Page {
rows: 400,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 452,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 370,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 472,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 240,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 332,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
@@ -488,7 +488,7 @@ fn test_string() {
],
dictionary_page: Some(Page {
rows: 2000,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 16000,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
@@ -528,7 +528,7 @@ fn test_list() {
pages: (0..10)
.map(|_| Page {
rows: 20,
- page_header_size: 34,
+ page_header_size: 36,
compressed_size: 672,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,