You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ya...@apache.org on 2024/01/20 10:52:59 UTC

(arrow-datafusion) branch main updated: Minor: distinguish parquet row group pruning test type (#8921)

This is an automated email from the ASF dual-hosted git repository.

yangjiang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 95e739cb60 Minor: distinguish parquet row group pruning test type (#8921)
95e739cb60 is described below

commit 95e739cb605307d3337c54ef3f0ab8c72cca5717
Author: Yang Jiang <ya...@ebay.com>
AuthorDate: Sat Jan 20 18:52:53 2024 +0800

    Minor: distinguish parquet row group pruning test type (#8921)
---
 datafusion/core/tests/parquet/row_group_pruning.rs | 122 ++++++++++++++-------
 1 file changed, 83 insertions(+), 39 deletions(-)

diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs
index 2bc5bd3f1c..fc1b66efed 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -26,11 +26,12 @@ use crate::parquet::Unit::RowGroup;
 use crate::parquet::{ContextWithParquet, Scenario};
 use datafusion_expr::{col, lit};
 
-async fn test_prune(
+async fn test_row_group_prune(
     case_data_type: Scenario,
     sql: &str,
     expected_errors: Option<usize>,
-    expected_row_group_pruned: Option<usize>,
+    expected_row_group_pruned_by_statistics: Option<usize>,
+    expected_row_group_pruned_by_bloom_filter: Option<usize>,
     expected_results: usize,
 ) {
     let output = ContextWithParquet::new(case_data_type, RowGroup)
@@ -40,7 +41,14 @@ async fn test_prune(
 
     println!("{}", output.description());
     assert_eq!(output.predicate_evaluation_errors(), expected_errors);
-    assert_eq!(output.row_groups_pruned(), expected_row_group_pruned);
+    assert_eq!(
+        output.row_groups_pruned_statistics(),
+        expected_row_group_pruned_by_statistics
+    );
+    assert_eq!(
+        output.row_groups_pruned_bloom_filter(),
+        expected_row_group_pruned_by_bloom_filter
+    );
     assert_eq!(
         output.result_rows,
         expected_results,
@@ -83,11 +91,12 @@ async fn test_prune_verbose(
 
 #[tokio::test]
 async fn prune_timestamps_nanos() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Timestamps,
         "SELECT * FROM t where nanos < to_timestamp('2020-01-02 01:01:11Z')",
         Some(0),
         Some(1),
+        Some(0),
         10,
     )
     .await;
@@ -95,11 +104,12 @@ async fn prune_timestamps_nanos() {
 
 #[tokio::test]
 async fn prune_timestamps_micros() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Timestamps,
         "SELECT * FROM t where micros < to_timestamp_micros('2020-01-02 01:01:11Z')",
         Some(0),
         Some(1),
+        Some(0),
         10,
     )
     .await;
@@ -107,11 +117,12 @@ async fn prune_timestamps_micros() {
 
 #[tokio::test]
 async fn prune_timestamps_millis() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Timestamps,
         "SELECT * FROM t where millis < to_timestamp_millis('2020-01-02 01:01:11Z')",
         Some(0),
         Some(1),
+        Some(0),
         10,
     )
     .await;
@@ -119,11 +130,12 @@ async fn prune_timestamps_millis() {
 
 #[tokio::test]
 async fn prune_timestamps_seconds() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Timestamps,
         "SELECT * FROM t where seconds < to_timestamp_seconds('2020-01-02 01:01:11Z')",
         Some(0),
         Some(1),
+        Some(0),
         10,
     )
     .await;
@@ -131,11 +143,12 @@ async fn prune_timestamps_seconds() {
 
 #[tokio::test]
 async fn prune_date32() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Dates,
         "SELECT * FROM t where date32 < cast('2020-01-02' as date)",
         Some(0),
         Some(3),
+        Some(0),
         1,
     )
     .await;
@@ -168,11 +181,12 @@ async fn prune_date64() {
 
 #[tokio::test]
 async fn prune_disabled() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Timestamps,
         "SELECT * FROM t where nanos < to_timestamp('2020-01-02 01:01:11Z')",
         Some(0),
         Some(1),
+        Some(0),
         10,
     )
     .await;
@@ -201,21 +215,23 @@ async fn prune_disabled() {
 
 #[tokio::test]
 async fn prune_int32_lt() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where i < 1",
         Some(0),
         Some(1),
+        Some(0),
         11,
     )
     .await;
     // result of sql "SELECT * FROM t where i < 1" is same as
     // "SELECT * FROM t where -i > -1"
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where -i > -1",
         Some(0),
         Some(1),
+        Some(0),
         11,
     )
     .await;
@@ -223,22 +239,24 @@ async fn prune_int32_lt() {
 
 #[tokio::test]
 async fn prune_int32_eq() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where i = 1",
         Some(0),
         Some(3),
+        Some(0),
         1,
     )
     .await;
 }
 #[tokio::test]
 async fn prune_int32_scalar_fun_and_eq() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where abs(i) = 1  and i = 1",
         Some(0),
         Some(3),
+        Some(0),
         1,
     )
     .await;
@@ -246,11 +264,12 @@ async fn prune_int32_scalar_fun_and_eq() {
 
 #[tokio::test]
 async fn prune_int32_scalar_fun() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where abs(i) = 1",
         Some(0),
         Some(0),
+        Some(0),
         3,
     )
     .await;
@@ -258,11 +277,12 @@ async fn prune_int32_scalar_fun() {
 
 #[tokio::test]
 async fn prune_int32_complex_expr() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where i+1 = 1",
         Some(0),
         Some(0),
+        Some(0),
         2,
     )
     .await;
@@ -270,11 +290,12 @@ async fn prune_int32_complex_expr() {
 
 #[tokio::test]
 async fn prune_int32_complex_expr_subtract() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where 1-i > 1",
         Some(0),
         Some(0),
+        Some(0),
         9,
     )
     .await;
@@ -282,19 +303,21 @@ async fn prune_int32_complex_expr_subtract() {
 
 #[tokio::test]
 async fn prune_f64_lt() {
-    test_prune(
+    test_row_group_prune(
         Scenario::Float64,
         "SELECT * FROM t where f < 1",
         Some(0),
         Some(1),
+        Some(0),
         11,
     )
     .await;
-    test_prune(
+    test_row_group_prune(
         Scenario::Float64,
         "SELECT * FROM t where -f > -1",
         Some(0),
         Some(1),
+        Some(0),
         11,
     )
     .await;
@@ -304,11 +327,12 @@ async fn prune_f64_lt() {
 async fn prune_f64_scalar_fun_and_gt() {
     // result of sql "SELECT * FROM t where abs(f - 1) <= 0.000001  and f >= 0.1"
     // only use "f >= 0" to prune
-    test_prune(
+    test_row_group_prune(
         Scenario::Float64,
         "SELECT * FROM t where abs(f - 1) <= 0.000001  and f >= 0.1",
         Some(0),
         Some(2),
+        Some(0),
         1,
     )
     .await;
@@ -317,11 +341,12 @@ async fn prune_f64_scalar_fun_and_gt() {
 #[tokio::test]
 async fn prune_f64_scalar_fun() {
     // result of sql "SELECT * FROM t where abs(f-1) <= 0.000001" is not supported
-    test_prune(
+    test_row_group_prune(
         Scenario::Float64,
         "SELECT * FROM t where abs(f-1) <= 0.000001",
         Some(0),
         Some(0),
+        Some(0),
         1,
     )
     .await;
@@ -330,11 +355,12 @@ async fn prune_f64_scalar_fun() {
 #[tokio::test]
 async fn prune_f64_complex_expr() {
     // result of sql "SELECT * FROM t where f+1 > 1.1"" is not supported
-    test_prune(
+    test_row_group_prune(
         Scenario::Float64,
         "SELECT * FROM t where f+1 > 1.1",
         Some(0),
         Some(0),
+        Some(0),
         9,
     )
     .await;
@@ -343,11 +369,12 @@ async fn prune_f64_complex_expr() {
 #[tokio::test]
 async fn prune_f64_complex_expr_subtract() {
     // result of sql "SELECT * FROM t where 1-f > 1" is not supported
-    test_prune(
+    test_row_group_prune(
         Scenario::Float64,
         "SELECT * FROM t where 1-f > 1",
         Some(0),
         Some(0),
+        Some(0),
         9,
     )
     .await;
@@ -356,11 +383,12 @@ async fn prune_f64_complex_expr_subtract() {
 #[tokio::test]
 async fn prune_int32_eq_in_list() {
     // result of sql "SELECT * FROM t where in (1)"
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where i in (1)",
         Some(0),
         Some(3),
+        Some(0),
         1,
     )
     .await;
@@ -404,11 +432,12 @@ async fn prune_int32_eq_large_in_list() {
 #[tokio::test]
 async fn prune_int32_eq_in_list_negated() {
     // result of sql "SELECT * FROM t where not in (1)" prune nothing
-    test_prune(
+    test_row_group_prune(
         Scenario::Int32,
         "SELECT * FROM t where i not in (1)",
         Some(0),
         Some(0),
+        Some(0),
         19,
     )
     .await;
@@ -419,39 +448,43 @@ async fn prune_decimal_lt() {
     // The data type of decimal_col is decimal(9,2)
     // There are three row groups:
     // [1.00, 6.00], [-5.00,6.00], [20.00,60.00]
-    test_prune(
+    test_row_group_prune(
         Scenario::Decimal,
         "SELECT * FROM t where decimal_col < 4",
         Some(0),
         Some(1),
+        Some(0),
         6,
     )
     .await;
     // compare with the casted decimal value
-    test_prune(
+    test_row_group_prune(
         Scenario::Decimal,
         "SELECT * FROM t where decimal_col < cast(4.55 as decimal(20,2))",
         Some(0),
         Some(1),
+        Some(0),
         8,
     )
     .await;
 
     // The data type of decimal_col is decimal(38,2)
-    test_prune(
+    test_row_group_prune(
         Scenario::DecimalLargePrecision,
         "SELECT * FROM t where decimal_col < 4",
         Some(0),
         Some(1),
+        Some(0),
         6,
     )
     .await;
     // compare with the casted decimal value
-    test_prune(
+    test_row_group_prune(
         Scenario::DecimalLargePrecision,
         "SELECT * FROM t where decimal_col < cast(4.55 as decimal(20,2))",
         Some(0),
         Some(1),
+        Some(0),
         8,
     )
     .await;
@@ -462,37 +495,41 @@ async fn prune_decimal_eq() {
     // The data type of decimal_col is decimal(9,2)
     // There are three row groups:
     // [1.00, 6.00], [-5.00,6.00], [20.00,60.00]
-    test_prune(
+    test_row_group_prune(
         Scenario::Decimal,
         "SELECT * FROM t where decimal_col = 4",
         Some(0),
         Some(1),
+        Some(0),
         2,
     )
     .await;
-    test_prune(
+    test_row_group_prune(
         Scenario::Decimal,
         "SELECT * FROM t where decimal_col = 4.00",
         Some(0),
         Some(1),
+        Some(0),
         2,
     )
     .await;
 
     // The data type of decimal_col is decimal(38,2)
-    test_prune(
+    test_row_group_prune(
         Scenario::DecimalLargePrecision,
         "SELECT * FROM t where decimal_col = 4",
         Some(0),
         Some(1),
+        Some(0),
         2,
     )
     .await;
-    test_prune(
+    test_row_group_prune(
         Scenario::DecimalLargePrecision,
         "SELECT * FROM t where decimal_col = 4.00",
         Some(0),
         Some(1),
+        Some(0),
         2,
     )
     .await;
@@ -503,37 +540,41 @@ async fn prune_decimal_in_list() {
     // The data type of decimal_col is decimal(9,2)
     // There are three row groups:
     // [1.00, 6.00], [-5.00,6.00], [20.00,60.00]
-    test_prune(
+    test_row_group_prune(
         Scenario::Decimal,
         "SELECT * FROM t where decimal_col in (4,3,2,123456789123)",
         Some(0),
         Some(1),
+        Some(0),
         5,
     )
     .await;
-    test_prune(
+    test_row_group_prune(
         Scenario::Decimal,
         "SELECT * FROM t where decimal_col in (4.00,3.00,11.2345,1)",
         Some(0),
         Some(1),
+        Some(0),
         6,
     )
     .await;
 
     // The data type of decimal_col is decimal(38,2)
-    test_prune(
+    test_row_group_prune(
         Scenario::DecimalLargePrecision,
         "SELECT * FROM t where decimal_col in (4,3,2,123456789123)",
         Some(0),
         Some(1),
+        Some(0),
         5,
     )
     .await;
-    test_prune(
+    test_row_group_prune(
         Scenario::DecimalLargePrecision,
         "SELECT * FROM t where decimal_col in (4.00,3.00,11.2345,1)",
         Some(0),
         Some(1),
+        Some(0),
         6,
     )
     .await;
@@ -545,28 +586,31 @@ async fn prune_periods_in_column_names() {
     // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'],
     // name = "HTTP PUT / DISPATCH", service.name = ['backend',  'frontend'],
     // name = "HTTP GET / DISPATCH", service.name = ['backend',  'backend' ],
-    test_prune(
+    test_row_group_prune(
         Scenario::PeriodsInColumnNames,
         // use double quotes to use column named "service.name"
         "SELECT \"name\", \"service.name\" FROM t WHERE \"service.name\" = 'frontend'",
         Some(0),
         Some(1), // prune out last row group
+        Some(0),
         7,
     )
     .await;
-    test_prune(
+    test_row_group_prune(
         Scenario::PeriodsInColumnNames,
         "SELECT \"name\", \"service.name\" FROM t WHERE \"name\" != 'HTTP GET / DISPATCH'",
         Some(0),
         Some(2), // prune out first and last row group
+        Some(0),
         5,
     )
     .await;
-    test_prune(
+    test_row_group_prune(
         Scenario::PeriodsInColumnNames,
         "SELECT \"name\", \"service.name\" FROM t WHERE \"service.name\" = 'frontend' AND \"name\" != 'HTTP GET / DISPATCH'",
         Some(0),
         Some(2), // prune out middle and last row group
+        Some(0),
         2,
     )
     .await;