You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/11/09 16:49:48 UTC

(arrow-datafusion) branch main updated: Bug-fix in Filter and Limit statistics (#8094)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new e54894c392 Bug-fix in Filter and Limit statistics (#8094)
e54894c392 is described below

commit e54894c39202815b14d9e7eae58f64d3a269c165
Author: Berkay Şahin <12...@users.noreply.github.com>
AuthorDate: Thu Nov 9 19:49:42 2023 +0300

    Bug-fix in Filter and Limit statistics (#8094)
    
    * Bug fix and code simplification
    
    * Remove limit stats changes
    
    * Test added
    
    * Reduce code diff
---
 datafusion/common/src/stats.rs               |  6 +++--
 datafusion/core/src/datasource/statistics.rs |  6 ++++-
 datafusion/physical-plan/src/filter.rs       | 38 ++++++++++++++++++++++++++--
 3 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index fbf639a321..2e799c92be 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -279,9 +279,11 @@ pub struct ColumnStatistics {
 impl ColumnStatistics {
     /// Column contains a single non null value (e.g constant).
     pub fn is_singleton(&self) -> bool {
-        match (self.min_value.get_value(), self.max_value.get_value()) {
+        match (&self.min_value, &self.max_value) {
             // Min and max values are the same and not infinity.
-            (Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max),
+            (Precision::Exact(min), Precision::Exact(max)) => {
+                !min.is_null() && !max.is_null() && (min == max)
+            }
             (_, _) => false,
         }
     }
diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs
index 3d8248dfde..695e139517 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -70,7 +70,11 @@ pub async fn get_statistics_with_limit(
         // files. This only applies when we know the number of rows. It also
         // currently ignores tables that have no statistics regarding the
         // number of rows.
-        if num_rows.get_value().unwrap_or(&usize::MIN) <= &limit.unwrap_or(usize::MAX) {
+        let conservative_num_rows = match num_rows {
+            Precision::Exact(nr) => nr,
+            _ => usize::MIN,
+        };
+        if conservative_num_rows <= limit.unwrap_or(usize::MAX) {
             while let Some(current) = all_files.next().await {
                 let (file, file_stats) = current?;
                 result_files.push(file);
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index ce66d61472..0c44b367e5 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -252,13 +252,25 @@ fn collect_new_statistics(
                 },
             )| {
                 let closed_interval = interval.close_bounds();
+                let (min_value, max_value) =
+                    if closed_interval.lower.value.eq(&closed_interval.upper.value) {
+                        (
+                            Precision::Exact(closed_interval.lower.value),
+                            Precision::Exact(closed_interval.upper.value),
+                        )
+                    } else {
+                        (
+                            Precision::Inexact(closed_interval.lower.value),
+                            Precision::Inexact(closed_interval.upper.value),
+                        )
+                    };
                 ColumnStatistics {
                     null_count: match input_column_stats[idx].null_count.get_value() {
                         Some(nc) => Precision::Inexact(*nc),
                         None => Precision::Absent,
                     },
-                    max_value: Precision::Inexact(closed_interval.upper.value),
-                    min_value: Precision::Inexact(closed_interval.lower.value),
+                    max_value,
+                    min_value,
                     distinct_count: match distinct_count.get_value() {
                         Some(dc) => Precision::Inexact(*dc),
                         None => Precision::Absent,
@@ -963,4 +975,26 @@ mod tests {
 
         Ok(())
     }
+
+    #[tokio::test]
+    async fn test_statistics_with_constant_column() -> Result<()> {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics::new_unknown(&schema),
+            schema,
+        ));
+        // WHERE a = 10
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Eq,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
+        ));
+        let filter: Arc<dyn ExecutionPlan> =
+            Arc::new(FilterExec::try_new(predicate, input)?);
+        let filter_statistics = filter.statistics()?;
+        // First column is "a", and it is a column with only one value after the filter.
+        assert!(filter_statistics.column_statistics[0].is_singleton());
+
+        Ok(())
+    }
 }