You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/08/09 11:25:06 UTC
[arrow-rs] branch active_release updated: Fix parquet string
statistics generation (#643) (#676)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/active_release by this push:
new 876f397 Fix parquet string statistics generation (#643) (#676)
876f397 is described below
commit 876f397a99821424681c7839de3ca729f525edd1
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Mon Aug 9 07:25:01 2021 -0400
Fix parquet string statistics generation (#643) (#676)
* Fix string statistics generation, add tests
* fix Int96 stats test
* Add notes for additional tickets
---
parquet/src/column/writer.rs | 122 +++++++++++++++++++++++++++++++++++++++++++
parquet/src/data_type.rs | 29 +++++-----
2 files changed, 134 insertions(+), 17 deletions(-)
diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs
index d5b8457..3cb17e1 100644
--- a/parquet/src/column/writer.rs
+++ b/parquet/src/column/writer.rs
@@ -1688,6 +1688,128 @@ mod tests {
}
#[test]
+ fn test_bool_statistics() {
+ let stats = statistics_roundtrip::<BoolType>(&[true, false, false, true]);
+ assert!(stats.has_min_max_set());
+ // should it be BooleanStatistics??
+ // https://github.com/apache/arrow-rs/issues/659
+ if let Statistics::Int32(stats) = stats {
+ assert_eq!(stats.min(), &0);
+ assert_eq!(stats.max(), &1);
+ } else {
+ panic!("expecting Statistics::Int32, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_int32_statistics() {
+ let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Int32(stats) = stats {
+ assert_eq!(stats.min(), &-2);
+ assert_eq!(stats.max(), &3);
+ } else {
+ panic!("expecting Statistics::Int32, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_int64_statistics() {
+ let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Int64(stats) = stats {
+ assert_eq!(stats.min(), &-2);
+ assert_eq!(stats.max(), &3);
+ } else {
+ panic!("expecting Statistics::Int64, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_int96_statistics() {
+ let input = vec![
+ Int96::from(vec![1, 20, 30]),
+ Int96::from(vec![3, 20, 10]),
+ Int96::from(vec![0, 20, 30]),
+ Int96::from(vec![2, 20, 30]),
+ ]
+ .into_iter()
+ .collect::<Vec<Int96>>();
+
+ let stats = statistics_roundtrip::<Int96Type>(&input);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Int96(stats) = stats {
+ assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30]));
+ assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10]));
+ } else {
+ panic!("expecting Statistics::Int96, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_float_statistics() {
+ let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Float(stats) = stats {
+ assert_eq!(stats.min(), &-2.0);
+ assert_eq!(stats.max(), &3.0);
+ } else {
+ panic!("expecting Statistics::Float, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_double_statistics() {
+ let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0, 2.0]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Double(stats) = stats {
+ assert_eq!(stats.min(), &-2.0);
+ assert_eq!(stats.max(), &3.0);
+ } else {
+ panic!("expecting Statistics::Double, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_byte_array_statistics() {
+ let input = vec!["aawaa", "zz", "aaw", "m", "qrs"]
+ .iter()
+ .map(|&s| s.into())
+ .collect::<Vec<ByteArray>>();
+
+ let stats = statistics_roundtrip::<ByteArrayType>(&input);
+ assert!(stats.has_min_max_set());
+ if let Statistics::ByteArray(stats) = stats {
+ assert_eq!(stats.min(), &ByteArray::from("aaw"));
+ assert_eq!(stats.max(), &ByteArray::from("zz"));
+ } else {
+ panic!("expecting Statistics::ByteArray, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_fixed_len_byte_array_statistics() {
+ let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "]
+ .iter()
+ .map(|&s| {
+ let b: ByteArray = s.into();
+ b.into()
+ })
+ .collect::<Vec<FixedLenByteArray>>();
+
+ let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input);
+ assert!(stats.has_min_max_set());
+ // should it be FixedLenByteArray?
+ // https://github.com/apache/arrow-rs/issues/660
+ if let Statistics::ByteArray(stats) = stats {
+ assert_eq!(stats.min(), &ByteArray::from("aaw "));
+ assert_eq!(stats.max(), &ByteArray::from("zz "));
+ } else {
+ panic!("expecting Statistics::ByteArray, got {:?}", stats);
+ }
+ }
+
+ #[test]
fn test_float_statistics_nan_middle() {
let stats = statistics_roundtrip::<FloatType>(&[1.0, f32::NAN, 2.0]);
assert!(stats.has_min_max_set());
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index 797324e..127ba95 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray {
impl PartialOrd for ByteArray {
fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> {
- if self.data.is_some() && other.data.is_some() {
- match self.len().cmp(&other.len()) {
- Ordering::Greater => Some(Ordering::Greater),
- Ordering::Less => Some(Ordering::Less),
- Ordering::Equal => {
- for (v1, v2) in self.data().iter().zip(other.data().iter()) {
- match v1.cmp(v2) {
- Ordering::Greater => return Some(Ordering::Greater),
- Ordering::Less => return Some(Ordering::Less),
- _ => {}
- }
- }
- Some(Ordering::Equal)
- }
+ // sort nulls first (consistent with PartialCmp on Option)
+ //
+ // Since ByteBuffer doesn't implement PartialOrd, so can't
+ // derive an implementation
+ match (&self.data, &other.data) {
+ (None, None) => Some(Ordering::Equal),
+ (None, Some(_)) => Some(Ordering::Less),
+ (Some(_), None) => Some(Ordering::Greater),
+ (Some(self_data), Some(other_data)) => {
+ // compare slices directly
+ self_data.data().partial_cmp(other_data.data())
}
- } else {
- None
}
}
}
@@ -1368,7 +1363,7 @@ mod tests {
let ba4 = ByteArray::from(vec![]);
let ba5 = ByteArray::from(vec![2, 2, 3]);
- assert!(ba1 > ba2);
+ assert!(ba1 < ba2);
assert!(ba3 > ba1);
assert!(ba1 > ba4);
assert_eq!(ba1, ba11);