You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/12/05 12:08:51 UTC

(arrow-rs) branch master updated: Parquet: don't truncate f16/decimal min/max stats (#5154)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b8d3f3380c Parquet: don't truncate f16/decimal min/max stats (#5154)
b8d3f3380c is described below

commit b8d3f3380c853a5f4b9c47410f6b30ed7aa28737
Author: Jeffrey <22...@users.noreply.github.com>
AuthorDate: Tue Dec 5 23:08:46 2023 +1100

    Parquet: don't truncate f16/decimal min/max stats (#5154)
    
    * Parquet: don't truncate f16/decimal min/max stats
    
    * Fix
---
 parquet/src/column/writer/mod.rs | 153 +++++++++++++++++++++++++++++++--------
 1 file changed, 124 insertions(+), 29 deletions(-)

diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index e92a502689..5dd7747c6f 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -680,32 +680,28 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                     }
                     self.last_non_null_data_page_min_max = Some((new_min.clone(), new_max.clone()));
 
-                    // We only truncate if the data is represented as binary
-                    match self.descr.physical_type() {
-                        Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => {
-                            self.column_index_builder.append(
-                                null_page,
-                                self.truncate_min_value(
-                                    self.props.column_index_truncate_length(),
-                                    stat.min_bytes(),
-                                )
-                                .0,
-                                self.truncate_max_value(
-                                    self.props.column_index_truncate_length(),
-                                    stat.max_bytes(),
-                                )
-                                .0,
-                                self.page_metrics.num_page_nulls as i64,
-                            );
-                        }
-                        _ => {
-                            self.column_index_builder.append(
-                                null_page,
-                                stat.min_bytes().to_vec(),
-                                stat.max_bytes().to_vec(),
-                                self.page_metrics.num_page_nulls as i64,
-                            );
-                        }
+                    if self.can_truncate_value() {
+                        self.column_index_builder.append(
+                            null_page,
+                            self.truncate_min_value(
+                                self.props.column_index_truncate_length(),
+                                stat.min_bytes(),
+                            )
+                            .0,
+                            self.truncate_max_value(
+                                self.props.column_index_truncate_length(),
+                                stat.max_bytes(),
+                            )
+                            .0,
+                            self.page_metrics.num_page_nulls as i64,
+                        );
+                    } else {
+                        self.column_index_builder.append(
+                            null_page,
+                            stat.min_bytes().to_vec(),
+                            stat.max_bytes().to_vec(),
+                            self.page_metrics.num_page_nulls as i64,
+                        );
                     }
                 }
             }
@@ -715,6 +711,26 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
             .append_row_count(self.page_metrics.num_buffered_rows as i64);
     }
 
+    /// Determine if we should allow truncating min/max values for this column's statistics
+    fn can_truncate_value(&self) -> bool {
+        match self.descr.physical_type() {
+            // Don't truncate for Float16 and Decimal because their sort order is different
+            // from that of FIXED_LEN_BYTE_ARRAY sort order.
+            // So truncation of those types could lead to inaccurate min/max statistics
+            Type::FIXED_LEN_BYTE_ARRAY
+                if !matches!(
+                    self.descr.logical_type(),
+                    Some(LogicalType::Decimal { .. }) | Some(LogicalType::Float16)
+                ) =>
+            {
+                true
+            }
+            Type::BYTE_ARRAY => true,
+            // Truncation only applies for fba/binary physical types
+            _ => false,
+        }
+    }
+
     fn truncate_min_value(&self, truncation_length: Option<usize>, data: &[u8]) -> (Vec<u8>, bool) {
         truncation_length
             .filter(|l| data.len() > *l)
@@ -948,7 +964,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
                         .with_min_is_exact(!did_truncate_min),
                     )
                 }
-                Statistics::FixedLenByteArray(stats) if stats.has_min_max_set() => {
+                Statistics::FixedLenByteArray(stats)
+                    if (stats.has_min_max_set() && self.can_truncate_value()) =>
+                {
                     let (min, did_truncate_min) = self.truncate_min_value(
                         self.props.statistics_truncate_length(),
                         stats.min_bytes(),
@@ -2713,6 +2731,82 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_float16_min_max_no_truncation() {
+        // Even if we set truncation to occur at 1 byte, we should not truncate for Float16
+        let builder = WriterProperties::builder().set_column_index_truncate_length(Some(1));
+        let props = Arc::new(builder.build());
+        let page_writer = get_test_page_writer();
+        let mut writer = get_test_float16_column_writer(page_writer, props);
+
+        let expected_value = f16::PI.to_le_bytes().to_vec();
+        let data = vec![ByteArray::from(expected_value.clone()).into()];
+        writer.write_batch(&data, None, None).unwrap();
+        writer.flush_data_pages().unwrap();
+
+        let r = writer.close().unwrap();
+
+        // stats should still be written
+        // ensure bytes weren't truncated for column index
+        let column_index = r.column_index.unwrap();
+        let column_index_min_bytes = column_index.min_values[0].as_slice();
+        let column_index_max_bytes = column_index.max_values[0].as_slice();
+        assert_eq!(expected_value, column_index_min_bytes);
+        assert_eq!(expected_value, column_index_max_bytes);
+
+        // ensure bytes weren't truncated for statistics
+        let stats = r.metadata.statistics().unwrap();
+        assert!(stats.has_min_max_set());
+        if let Statistics::FixedLenByteArray(stats) = stats {
+            let stats_min_bytes = stats.min_bytes();
+            let stats_max_bytes = stats.max_bytes();
+            assert_eq!(expected_value, stats_min_bytes);
+            assert_eq!(expected_value, stats_max_bytes);
+        } else {
+            panic!("expecting Statistics::FixedLenByteArray");
+        }
+    }
+
+    #[test]
+    fn test_decimal_min_max_no_truncation() {
+        // Even if we set truncation to occur at 1 byte, we should not truncate for Decimal
+        let builder = WriterProperties::builder().set_column_index_truncate_length(Some(1));
+        let props = Arc::new(builder.build());
+        let page_writer = get_test_page_writer();
+        let mut writer =
+            get_test_decimals_column_writer::<FixedLenByteArrayType>(page_writer, 0, 0, props);
+
+        let expected_value = vec![
+            255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 179u8, 172u8, 19u8, 35u8,
+            231u8, 90u8, 0u8, 0u8,
+        ];
+        let data = vec![ByteArray::from(expected_value.clone()).into()];
+        writer.write_batch(&data, None, None).unwrap();
+        writer.flush_data_pages().unwrap();
+
+        let r = writer.close().unwrap();
+
+        // stats should still be written
+        // ensure bytes weren't truncated for column index
+        let column_index = r.column_index.unwrap();
+        let column_index_min_bytes = column_index.min_values[0].as_slice();
+        let column_index_max_bytes = column_index.max_values[0].as_slice();
+        assert_eq!(expected_value, column_index_min_bytes);
+        assert_eq!(expected_value, column_index_max_bytes);
+
+        // ensure bytes weren't truncated for statistics
+        let stats = r.metadata.statistics().unwrap();
+        assert!(stats.has_min_max_set());
+        if let Statistics::FixedLenByteArray(stats) = stats {
+            let stats_min_bytes = stats.min_bytes();
+            let stats_max_bytes = stats.max_bytes();
+            assert_eq!(expected_value, stats_min_bytes);
+            assert_eq!(expected_value, stats_max_bytes);
+        } else {
+            panic!("expecting Statistics::FixedLenByteArray");
+        }
+    }
+
     #[test]
     fn test_statistics_truncating_byte_array() {
         let page_writer = get_test_page_writer();
@@ -3421,7 +3515,7 @@ mod tests {
         values: &[FixedLenByteArray],
     ) -> ValueStatistics<FixedLenByteArray> {
         let page_writer = get_test_page_writer();
-        let mut writer = get_test_float16_column_writer(page_writer);
+        let mut writer = get_test_float16_column_writer(page_writer, Default::default());
         writer.write_batch(values, None, None).unwrap();
 
         let metadata = writer.close().unwrap().metadata;
@@ -3434,9 +3528,10 @@ mod tests {
 
     fn get_test_float16_column_writer(
         page_writer: Box<dyn PageWriter>,
+        props: WriterPropertiesPtr,
     ) -> ColumnWriterImpl<'static, FixedLenByteArrayType> {
         let descr = Arc::new(get_test_float16_column_descr(0, 0));
-        let column_writer = get_column_writer(descr, Default::default(), page_writer);
+        let column_writer = get_column_writer(descr, props, page_writer);
         get_typed_column_writer::<FixedLenByteArrayType>(column_writer)
     }