You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/08/08 11:59:56 UTC
[arrow-datafusion] branch master updated: Fix `IsNull` pruning expression generation without null_count statistics (#3044)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 6e6f3bf9d Fix `IsNull` pruning expression generation without null_count statistics (#3044)
6e6f3bf9d is described below
commit 6e6f3bf9dec82b332e21f11699552c34f72493ac
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Mon Aug 8 07:59:52 2022 -0400
Fix `IsNull` pruning expression generation without null_count statistics (#3044)
---
datafusion/core/src/physical_optimizer/pruning.rs | 105 ++++++++++++++++++++--
1 file changed, 99 insertions(+), 6 deletions(-)
diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs
index 809bbcaeb..859ef79fc 100644
--- a/datafusion/core/src/physical_optimizer/pruning.rs
+++ b/datafusion/core/src/physical_optimizer/pruning.rs
@@ -639,7 +639,7 @@ fn build_is_null_column_expr(
Expr::Column(ref col) => {
let field = schema.field_with_name(&col.name).ok()?;
- let null_count_field = &Field::new(field.name(), DataType::UInt64, false);
+ let null_count_field = &Field::new(field.name(), DataType::UInt64, true);
required_columns
.null_count_column_expr(col, expr, null_count_field)
.map(|null_count_column_expr| {
@@ -809,10 +809,19 @@ mod tests {
use std::collections::HashMap;
#[derive(Debug)]
- /// Test for container stats
+
+ /// Mock statistic provider for tests
+ ///
+ /// Each row represents the statistics for a "container" (which
+ /// might represent an entire parquet file, or directory of files,
+ /// or some other collection of data for which we had statistics)
+ ///
+ /// Note All `ArrayRefs` must be the same size.
struct ContainerStats {
min: ArrayRef,
max: ArrayRef,
+ /// Optional values
+ null_counts: Option<ArrayRef>,
}
impl ContainerStats {
@@ -835,6 +844,7 @@ mod tests {
.with_precision_and_scale(precision, scale)
.unwrap(),
),
+ null_counts: None,
}
}
@@ -845,6 +855,7 @@ mod tests {
Self {
min: Arc::new(min.into_iter().collect::<Int64Array>()),
max: Arc::new(max.into_iter().collect::<Int64Array>()),
+ null_counts: None,
}
}
@@ -855,6 +866,7 @@ mod tests {
Self {
min: Arc::new(min.into_iter().collect::<Int32Array>()),
max: Arc::new(max.into_iter().collect::<Int32Array>()),
+ null_counts: None,
}
}
@@ -865,6 +877,7 @@ mod tests {
Self {
min: Arc::new(min.into_iter().collect::<StringArray>()),
max: Arc::new(max.into_iter().collect::<StringArray>()),
+ null_counts: None,
}
}
@@ -875,6 +888,7 @@ mod tests {
Self {
min: Arc::new(min.into_iter().collect::<BooleanArray>()),
max: Arc::new(max.into_iter().collect::<BooleanArray>()),
+ null_counts: None,
}
}
@@ -886,10 +900,29 @@ mod tests {
Some(self.max.clone())
}
+ fn null_counts(&self) -> Option<ArrayRef> {
+ self.null_counts.clone()
+ }
+
fn len(&self) -> usize {
assert_eq!(self.min.len(), self.max.len());
self.min.len()
}
+
+ /// Add null counts. There must be the same number of null counts as
+ /// there are containers
+ fn with_null_counts(
+ mut self,
+ counts: impl IntoIterator<Item = Option<i64>>,
+ ) -> Self {
+ // take stats out and update them
+ let null_counts: ArrayRef =
+ Arc::new(counts.into_iter().collect::<Int64Array>());
+
+ assert_eq!(null_counts.len(), self.len());
+ self.null_counts = Some(null_counts);
+ self
+ }
}
#[derive(Debug, Default)]
@@ -908,8 +941,30 @@ mod tests {
name: impl Into<String>,
container_stats: ContainerStats,
) -> Self {
- self.stats
- .insert(Column::from_name(name.into()), container_stats);
+ let col = Column::from_name(name.into());
+ self.stats.insert(col, container_stats);
+ self
+ }
+
+ /// Add null counts for the specified columm.
+ /// There must be the same number of null counts as
+ /// there are containers
+ fn with_null_counts(
+ mut self,
+ name: impl Into<String>,
+ counts: impl IntoIterator<Item = Option<i64>>,
+ ) -> Self {
+ let col = Column::from_name(name.into());
+
+ // take stats out and update them
+ let container_stats = self
+ .stats
+ .remove(&col)
+ .expect("Can not find stats for column")
+ .with_null_counts(counts);
+
+ // put stats back in
+ self.stats.insert(col, container_stats);
self
}
}
@@ -937,8 +992,11 @@ mod tests {
.unwrap_or(0)
}
- fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
- None
+ fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
+ self.stats
+ .get(column)
+ .map(|container_stats| container_stats.null_counts())
+ .unwrap_or(None)
}
}
@@ -1761,4 +1819,39 @@ mod tests {
let result = p.prune(&statistics).unwrap();
assert_eq!(result, expected_ret);
}
+
+ #[test]
+ fn prune_int32_is_null() {
+ let (schema, statistics) = int32_setup();
+
+ // Expression "i IS NULL" when there are no null statistics,
+ // should all be kept
+ let expected_ret = vec![true, true, true, true, true];
+
+ // i IS NULL, no null statistics
+ let expr = col("i").is_null();
+ let p = PruningPredicate::try_new(expr, schema.clone()).unwrap();
+ let result = p.prune(&statistics).unwrap();
+ assert_eq!(result, expected_ret);
+
+ // provide null counts for each column
+ let statistics = statistics.with_null_counts(
+ "i",
+ vec![
+ Some(0), // no nulls (don't keep)
+ Some(1), // 1 null
+ None, // unknown nulls
+ None, // unknown nulls (min/max are both null too, like no stats at all)
+ Some(0), // 0 nulls (max=null too which means no known max) (don't keep)
+ ],
+ );
+
+ let expected_ret = vec![false, true, true, true, false];
+
+ // i IS NULL, with actual null statistcs
+ let expr = col("i").is_null();
+ let p = PruningPredicate::try_new(expr, schema).unwrap();
+ let result = p.prune(&statistics).unwrap();
+ assert_eq!(result, expected_ret);
+ }
}