Posted to commits@arrow.apache.org by li...@apache.org on 2022/07/30 02:18:06 UTC

[arrow-datafusion] branch master updated: test: add file/SQL level test for pruning parquet row group with decimal data type. (#2977)

This is an automated email from the ASF dual-hosted git repository.

liukun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 3d1de1557 test: add file/SQL level test for pruning parquet row group with decimal data type. (#2977)
3d1de1557 is described below

commit 3d1de1557143efa7514e73839f5c54a6808d388c
Author: Kun Liu <li...@apache.org>
AuthorDate: Sat Jul 30 10:18:01 2022 +0800

    test: add file/SQL level test for pruning parquet row group with decimal data type. (#2977)
---
 datafusion/core/tests/parquet_pruning.rs | 186 +++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
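
For context on what the new tests exercise: a row group is pruned when the decimal predicate cannot be satisfied by that group's min/max statistics, and Decimal(9,2) / Decimal(38,2) values are stored as scaled i128 integers (100 represents 1.00). The sketch below is illustrative only and not part of this commit; the helper name can_prune_lt and the f64 conversion are assumptions made for readability, not DataFusion's actual pruning code.

    // Illustrative sketch, not committed code: the conceptual pruning decision
    // for a "decimal_col < literal" predicate on a Decimal(_, 2) column.
    fn can_prune_lt(row_group_min: i128, scale: u32, literal: f64) -> bool {
        // Statistics are stored as scaled integers; 2000 at scale 2 is 20.00.
        let min = row_group_min as f64 / 10f64.powi(scale as i32);
        // If even the minimum value fails "min < literal", no row can match.
        min >= literal
    }

    fn main() {
        // Row-group minimums from make_decimal_batch below: 1.00, -5.00, 20.00
        for min in [100_i128, -500, 2000] {
            println!("min {}: prunable = {}", min, can_prune_lt(min, 2, 4.0));
        }
        // Prints false, false, true: only the [20.00, 60.00] row group is
        // skipped, matching the expected Some(1) in prune_decimal_lt.
    }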

diff --git a/datafusion/core/tests/parquet_pruning.rs b/datafusion/core/tests/parquet_pruning.rs
index b4bf6a0c0..df7be0fcf 100644
--- a/datafusion/core/tests/parquet_pruning.rs
+++ b/datafusion/core/tests/parquet_pruning.rs
@@ -19,6 +19,7 @@
 // data into a parquet file and then
 use std::sync::Arc;
 
+use arrow::array::Decimal128Array;
 use arrow::{
     array::{
         Array, ArrayRef, Date32Array, Date64Array, Float64Array, Int32Array, StringArray,
@@ -449,6 +450,154 @@ async fn prune_int32_eq_in_list_negated() {
     assert_eq!(output.result_rows, 19, "{}", output.description());
 }
 
+async fn test_prune_decimal(
+    decimal_case_type: Scenario,
+    sql: &str,
+    expected_errors: Option<usize>,
+    expected_row_group_pruned: Option<usize>,
+    expected_results: usize,
+) {
+    let output = ContextWithParquet::new(decimal_case_type)
+        .await
+        .query(sql)
+        .await;
+
+    println!("{}", output.description());
+    assert_eq!(output.predicate_evaluation_errors(), expected_errors);
+    assert_eq!(output.row_groups_pruned(), expected_row_group_pruned);
+    assert_eq!(
+        output.result_rows,
+        expected_results,
+        "{}",
+        output.description()
+    );
+}
+
+#[tokio::test]
+async fn prune_decimal_lt() {
+    // The data type of decimal_col is decimal(9,2)
+    // There are three row groups:
+    // [1.00, 6.00], [-5.00, 6.00], [20.00, 60.00]
+    test_prune_decimal(
+        Scenario::Decimal,
+        "SELECT * FROM t where decimal_col < 4",
+        Some(0),
+        Some(1),
+        6,
+    )
+    .await;
+    // compare with the cast decimal value
+    test_prune_decimal(
+        Scenario::Decimal,
+        "SELECT * FROM t where decimal_col < cast(4.55 as decimal(20,2))",
+        Some(0),
+        Some(1),
+        8,
+    )
+    .await;
+
+    // The data type of decimal_col is decimal(38,2)
+    test_prune_decimal(
+        Scenario::DecimalLargePrecision,
+        "SELECT * FROM t where decimal_col < 4",
+        Some(0),
+        Some(1),
+        6,
+    )
+    .await;
+    // compare with the cast decimal value
+    test_prune_decimal(
+        Scenario::DecimalLargePrecision,
+        "SELECT * FROM t where decimal_col < cast(4.55 as decimal(20,2))",
+        Some(0),
+        Some(1),
+        8,
+    )
+    .await;
+}
+
+#[tokio::test]
+async fn prune_decimal_eq() {
+    // The data type of decimal_col is decimal(9,2)
+    // There are three row groups:
+    // [1.00, 6.00], [-5.00, 6.00], [20.00, 60.00]
+    test_prune_decimal(
+        Scenario::Decimal,
+        "SELECT * FROM t where decimal_col = 4",
+        Some(0),
+        Some(1),
+        2,
+    )
+    .await;
+    test_prune_decimal(
+        Scenario::Decimal,
+        "SELECT * FROM t where decimal_col = 4.00",
+        Some(0),
+        Some(1),
+        2,
+    )
+    .await;
+
+    // The data type of decimal_col is decimal(38,2)
+    test_prune_decimal(
+        Scenario::DecimalLargePrecision,
+        "SELECT * FROM t where decimal_col = 4",
+        Some(0),
+        Some(1),
+        2,
+    )
+    .await;
+    test_prune_decimal(
+        Scenario::DecimalLargePrecision,
+        "SELECT * FROM t where decimal_col = 4.00",
+        Some(0),
+        Some(1),
+        2,
+    )
+    .await;
+}
+
+#[tokio::test]
+async fn prune_decimal_in_list() {
+    // The data type of decimal_col is decimal(9,2)
+    // There are three row groups:
+    // [1.00, 6.00], [-5.00, 6.00], [20.00, 60.00]
+    test_prune_decimal(
+        Scenario::Decimal,
+        "SELECT * FROM t where decimal_col in (4,3,2,123456789123)",
+        Some(0),
+        Some(1),
+        5,
+    )
+    .await;
+    test_prune_decimal(
+        Scenario::Decimal,
+        "SELECT * FROM t where decimal_col in (4.00,3.00,11.2345,1)",
+        Some(0),
+        Some(1),
+        6,
+    )
+    .await;
+
+    // The data type of decimal_col is decimal(38,2)
+    test_prune_decimal(
+        Scenario::DecimalLargePrecision,
+        "SELECT * FROM t where decimal_col in (4,3,2,123456789123)",
+        Some(0),
+        Some(1),
+        5,
+    )
+    .await;
+    test_prune_decimal(
+        Scenario::DecimalLargePrecision,
+        "SELECT * FROM t where decimal_col in (4.00,3.00,11.2345,1)",
+        Some(0),
+        Some(1),
+        6,
+    )
+    .await;
+}
+
 // ----------------------
 // Begin test fixture
 // ----------------------
@@ -459,6 +608,8 @@ enum Scenario {
     Dates,
     Int32,
     Float64,
+    Decimal,
+    DecimalLargePrecision,
 }
 
 /// Test fixture that has an execution context that has an external
@@ -681,6 +832,23 @@ async fn make_test_file(scenario: Scenario) -> NamedTempFile {
                 make_f64_batch(vec![5.0, 6.0, 7.0, 8.0, 9.0]),
             ]
         }
+        Scenario::Decimal => {
+            // decimal record batch
+            vec![
+                make_decimal_batch(vec![100, 200, 300, 400, 600], 9, 2),
+                make_decimal_batch(vec![-500, 100, 300, 400, 600], 9, 2),
+                make_decimal_batch(vec![2000, 3000, 3000, 4000, 6000], 9, 2),
+            ]
+        }
+        Scenario::DecimalLargePrecision => {
+            // decimal record batch with large precision,
+            // and the data will be stored as FIXED_LEN_BYTE_ARRAY
+            vec![
+                make_decimal_batch(vec![100, 200, 300, 400, 600], 38, 2),
+                make_decimal_batch(vec![-500, 100, 300, 400, 600], 38, 2),
+                make_decimal_batch(vec![2000, 3000, 3000, 4000, 6000], 38, 2),
+            ]
+        }
     };
 
     let schema = batches[0].schema();
@@ -799,6 +967,24 @@ fn make_f64_batch(v: Vec<f64>) -> RecordBatch {
     RecordBatch::try_new(schema, vec![array.clone()]).unwrap()
 }
 
+/// Return a record batch with a decimal column
+///
+/// Columns are named
+/// "decimal_col" -> Decimal128Array
+fn make_decimal_batch(v: Vec<i128>, precision: usize, scale: usize) -> RecordBatch {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "decimal_col",
+        DataType::Decimal(precision, scale),
+        true,
+    )]));
+    let array = Arc::new(
+        Decimal128Array::from_iter_values(v)
+            .with_precision_and_scale(precision, scale)
+            .unwrap(),
+    ) as ArrayRef;
+    RecordBatch::try_new(schema, vec![array.clone()]).unwrap()
+}
+
 /// Return record batch with a few rows of data for all of the supported date
 /// types with the specified offset (in days)
 ///