You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/08/09 11:25:06 UTC

[arrow-rs] branch active_release updated: Fix parquet string statistics generation (#643) (#676)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/active_release by this push:
     new 876f397  Fix parquet string statistics generation (#643) (#676)
876f397 is described below

commit 876f397a99821424681c7839de3ca729f525edd1
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Mon Aug 9 07:25:01 2021 -0400

    Fix parquet string statistics generation (#643) (#676)
    
    * Fix string statistics generation, add tests
    
    * fix Int96 stats test
    
    * Add notes for additional tickets
---
 parquet/src/column/writer.rs | 122 +++++++++++++++++++++++++++++++++++++++++++
 parquet/src/data_type.rs     |  29 +++++-----
 2 files changed, 134 insertions(+), 17 deletions(-)

diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs
index d5b8457..3cb17e1 100644
--- a/parquet/src/column/writer.rs
+++ b/parquet/src/column/writer.rs
@@ -1688,6 +1688,128 @@ mod tests {
     }
 
     #[test]
+    fn test_bool_statistics() {
+        let stats = statistics_roundtrip::<BoolType>(&[true, false, false, true]);
+        assert!(stats.has_min_max_set());
+        // should it be BooleanStatistics??
+        // https://github.com/apache/arrow-rs/issues/659
+        if let Statistics::Int32(stats) = stats {
+            assert_eq!(stats.min(), &0);
+            assert_eq!(stats.max(), &1);
+        } else {
+            panic!("expecting Statistics::Int32, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int32_statistics() {
+        let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int32(stats) = stats {
+            assert_eq!(stats.min(), &-2);
+            assert_eq!(stats.max(), &3);
+        } else {
+            panic!("expecting Statistics::Int32, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int64_statistics() {
+        let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int64(stats) = stats {
+            assert_eq!(stats.min(), &-2);
+            assert_eq!(stats.max(), &3);
+        } else {
+            panic!("expecting Statistics::Int64, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int96_statistics() {
+        let input = vec![
+            Int96::from(vec![1, 20, 30]),
+            Int96::from(vec![3, 20, 10]),
+            Int96::from(vec![0, 20, 30]),
+            Int96::from(vec![2, 20, 30]),
+        ]
+        .into_iter()
+        .collect::<Vec<Int96>>();
+
+        let stats = statistics_roundtrip::<Int96Type>(&input);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int96(stats) = stats {
+            assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30]));
+            assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10]));
+        } else {
+            panic!("expecting Statistics::Int96, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_float_statistics() {
+        let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Float(stats) = stats {
+            assert_eq!(stats.min(), &-2.0);
+            assert_eq!(stats.max(), &3.0);
+        } else {
+            panic!("expecting Statistics::Float, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_double_statistics() {
+        let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0, 2.0]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Double(stats) = stats {
+            assert_eq!(stats.min(), &-2.0);
+            assert_eq!(stats.max(), &3.0);
+        } else {
+            panic!("expecting Statistics::Double, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_byte_array_statistics() {
+        let input = vec!["aawaa", "zz", "aaw", "m", "qrs"]
+            .iter()
+            .map(|&s| s.into())
+            .collect::<Vec<ByteArray>>();
+
+        let stats = statistics_roundtrip::<ByteArrayType>(&input);
+        assert!(stats.has_min_max_set());
+        if let Statistics::ByteArray(stats) = stats {
+            assert_eq!(stats.min(), &ByteArray::from("aaw"));
+            assert_eq!(stats.max(), &ByteArray::from("zz"));
+        } else {
+            panic!("expecting Statistics::ByteArray, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_fixed_len_byte_array_statistics() {
+        let input = vec!["aawaa", "zz   ", "aaw  ", "m    ", "qrs  "]
+            .iter()
+            .map(|&s| {
+                let b: ByteArray = s.into();
+                b.into()
+            })
+            .collect::<Vec<FixedLenByteArray>>();
+
+        let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input);
+        assert!(stats.has_min_max_set());
+        // should it be FixedLenByteArray?
+        // https://github.com/apache/arrow-rs/issues/660
+        if let Statistics::ByteArray(stats) = stats {
+            assert_eq!(stats.min(), &ByteArray::from("aaw  "));
+            assert_eq!(stats.max(), &ByteArray::from("zz   "));
+        } else {
+            panic!("expecting Statistics::ByteArray, got {:?}", stats);
+        }
+    }
+
+    #[test]
     fn test_float_statistics_nan_middle() {
         let stats = statistics_roundtrip::<FloatType>(&[1.0, f32::NAN, 2.0]);
         assert!(stats.has_min_max_set());
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index 797324e..127ba95 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray {
 
 impl PartialOrd for ByteArray {
     fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> {
-        if self.data.is_some() && other.data.is_some() {
-            match self.len().cmp(&other.len()) {
-                Ordering::Greater => Some(Ordering::Greater),
-                Ordering::Less => Some(Ordering::Less),
-                Ordering::Equal => {
-                    for (v1, v2) in self.data().iter().zip(other.data().iter()) {
-                        match v1.cmp(v2) {
-                            Ordering::Greater => return Some(Ordering::Greater),
-                            Ordering::Less => return Some(Ordering::Less),
-                            _ => {}
-                        }
-                    }
-                    Some(Ordering::Equal)
-                }
+        // sort nulls first (consistent with PartialCmp on Option)
+        //
+        // Since ByteBuffer doesn't implement PartialOrd, so can't
+        // derive an implementation
+        match (&self.data, &other.data) {
+            (None, None) => Some(Ordering::Equal),
+            (None, Some(_)) => Some(Ordering::Less),
+            (Some(_), None) => Some(Ordering::Greater),
+            (Some(self_data), Some(other_data)) => {
+                // compare slices directly
+                self_data.data().partial_cmp(other_data.data())
             }
-        } else {
-            None
         }
     }
 }
@@ -1368,7 +1363,7 @@ mod tests {
         let ba4 = ByteArray::from(vec![]);
         let ba5 = ByteArray::from(vec![2, 2, 3]);
 
-        assert!(ba1 > ba2);
+        assert!(ba1 < ba2);
         assert!(ba3 > ba1);
         assert!(ba1 > ba4);
         assert_eq!(ba1, ba11);