You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/11/09 16:49:48 UTC
(arrow-datafusion) branch main updated: Bug-fix in Filter and Limit statistics (#8094)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new e54894c392 Bug-fix in Filter and Limit statistics (#8094)
e54894c392 is described below
commit e54894c39202815b14d9e7eae58f64d3a269c165
Author: Berkay Şahin <12...@users.noreply.github.com>
AuthorDate: Thu Nov 9 19:49:42 2023 +0300
Bug-fix in Filter and Limit statistics (#8094)
* Bug fix and code simplification
* Remove limit stats changes
* Test added
* Reduce code diff
---
datafusion/common/src/stats.rs | 6 +++--
datafusion/core/src/datasource/statistics.rs | 6 ++++-
datafusion/physical-plan/src/filter.rs | 38 ++++++++++++++++++++++++++--
3 files changed, 45 insertions(+), 5 deletions(-)
diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index fbf639a321..2e799c92be 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -279,9 +279,11 @@ pub struct ColumnStatistics {
impl ColumnStatistics {
/// Column contains a single non null value (e.g constant).
pub fn is_singleton(&self) -> bool {
- match (self.min_value.get_value(), self.max_value.get_value()) {
+ match (&self.min_value, &self.max_value) {
// Min and max values are the same and not infinity.
- (Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max),
+ (Precision::Exact(min), Precision::Exact(max)) => {
+ !min.is_null() && !max.is_null() && (min == max)
+ }
(_, _) => false,
}
}
diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs
index 3d8248dfde..695e139517 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -70,7 +70,11 @@ pub async fn get_statistics_with_limit(
// files. This only applies when we know the number of rows. It also
// currently ignores tables that have no statistics regarding the
// number of rows.
- if num_rows.get_value().unwrap_or(&usize::MIN) <= &limit.unwrap_or(usize::MAX) {
+ let conservative_num_rows = match num_rows {
+ Precision::Exact(nr) => nr,
+ _ => usize::MIN,
+ };
+ if conservative_num_rows <= limit.unwrap_or(usize::MAX) {
while let Some(current) = all_files.next().await {
let (file, file_stats) = current?;
result_files.push(file);
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index ce66d61472..0c44b367e5 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -252,13 +252,25 @@ fn collect_new_statistics(
},
)| {
let closed_interval = interval.close_bounds();
+ let (min_value, max_value) =
+ if closed_interval.lower.value.eq(&closed_interval.upper.value) {
+ (
+ Precision::Exact(closed_interval.lower.value),
+ Precision::Exact(closed_interval.upper.value),
+ )
+ } else {
+ (
+ Precision::Inexact(closed_interval.lower.value),
+ Precision::Inexact(closed_interval.upper.value),
+ )
+ };
ColumnStatistics {
null_count: match input_column_stats[idx].null_count.get_value() {
Some(nc) => Precision::Inexact(*nc),
None => Precision::Absent,
},
- max_value: Precision::Inexact(closed_interval.upper.value),
- min_value: Precision::Inexact(closed_interval.lower.value),
+ max_value,
+ min_value,
distinct_count: match distinct_count.get_value() {
Some(dc) => Precision::Inexact(*dc),
None => Precision::Absent,
@@ -963,4 +975,26 @@ mod tests {
Ok(())
}
+
+ #[tokio::test]
+ async fn test_statistics_with_constant_column() -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+ let input = Arc::new(StatisticsExec::new(
+ Statistics::new_unknown(&schema),
+ schema,
+ ));
+ // WHERE a = 10
+ let predicate = Arc::new(BinaryExpr::new(
+ Arc::new(Column::new("a", 0)),
+ Operator::Eq,
+ Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+ ));
+ let filter: Arc<dyn ExecutionPlan> =
+ Arc::new(FilterExec::try_new(predicate, input)?);
+ let filter_statistics = filter.statistics()?;
+ // First column is "a", and it is a column with only one value after the filter.
+ assert!(filter_statistics.column_statistics[0].is_singleton());
+
+ Ok(())
+ }
}