You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2023/01/05 03:16:22 UTC

[arrow-datafusion] branch master updated: Add test cases: row group filter with missing statistics for decimal data type (#4810)

This is an automated email from the ASF dual-hosted git repository.

liukun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 087ac0993 Add test cases: row group filter with missing statistics for decimal data type (#4810)
087ac0993 is described below

commit 087ac099394bb275d55a860d9ec029ce23078aa2
Author: Kun Liu <li...@apache.org>
AuthorDate: Thu Jan 5 11:16:17 2023 +0800

    Add test cases: row group filter with missing statistics for decimal data type (#4810)
    
    * add null case for row group filter
    
    * Apply suggestions from code review
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
 .../file_format/parquet/row_groups.rs              | 69 ++++++++++++++++++----
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
index 716c47cf9..4aae795b4 100644
--- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
+++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
@@ -475,10 +475,21 @@ mod tests {
             // c1 > 5, this row group will not be included in the results.
             vec![ParquetStatistics::int32(Some(10), Some(20), None, 0, false)],
         );
+        let rgm3 = get_row_group_meta_data(
+            &schema_descr,
+            // [1, None]
+            // c1 > 5, the max statistic is missing, so this row group cannot be filtered out and will be included in the results.
+            vec![ParquetStatistics::int32(Some(100), None, None, 0, false)],
+        );
         let metrics = parquet_file_metrics();
         assert_eq!(
-            prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
-            vec![0]
+            prune_row_groups(
+                &[rgm1, rgm2, rgm3],
+                None,
+                Some(&pruning_predicate),
+                &metrics
+            ),
+            vec![0, 2]
         );
 
         // INT32: c1 > 5, but parquet decimal type has different precision or scale to arrow decimal
@@ -528,15 +539,21 @@ mod tests {
             // c1 > 5, this row group will not be included in the results.
             vec![ParquetStatistics::int32(Some(0), Some(2), None, 0, false)],
         );
+        let rgm4 = get_row_group_meta_data(
+            &schema_descr,
+            // [None, 2]
+            // c1 > 5, the min statistic is missing, so this row group cannot be filtered out and will be included in the results.
+            vec![ParquetStatistics::int32(None, Some(2), None, 0, false)],
+        );
         let metrics = parquet_file_metrics();
         assert_eq!(
             prune_row_groups(
-                &[rgm1, rgm2, rgm3],
+                &[rgm1, rgm2, rgm3, rgm4],
                 None,
                 Some(&pruning_predicate),
                 &metrics
             ),
-            vec![0, 1]
+            vec![0, 1, 3]
         );
 
         // INT64: c1 < 5, the c1 is decimal(18,2)
@@ -572,10 +589,20 @@ mod tests {
             // [0.1, 0.2]
             vec![ParquetStatistics::int64(Some(10), Some(20), None, 0, false)],
         );
+        let rgm3 = get_row_group_meta_data(
+            &schema_descr,
+            // [None, None]: min/max statistics are missing, so this row group cannot be filtered out
+            vec![ParquetStatistics::int64(None, None, None, 0, false)],
+        );
         let metrics = parquet_file_metrics();
         assert_eq!(
-            prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
-            vec![1]
+            prune_row_groups(
+                &[rgm1, rgm2, rgm3],
+                None,
+                Some(&pruning_predicate),
+                &metrics
+            ),
+            vec![1, 2]
         );
 
         // FIXED_LENGTH_BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
@@ -631,13 +658,24 @@ mod tests {
                 false,
             )],
         );
+
+        let rgm3 = get_row_group_meta_data(
+            &schema_descr,
+            vec![ParquetStatistics::fixed_len_byte_array(
+                None, None, None, 0, false,
+            )],
+        );
         let metrics = parquet_file_metrics();
         assert_eq!(
-            prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
-            vec![1]
+            prune_row_groups(
+                &[rgm1, rgm2, rgm3],
+                None,
+                Some(&pruning_predicate),
+                &metrics
+            ),
+            vec![1, 2]
         );
 
-        // TODO: BYTE_ARRAY support read decimal from parquet, after the 20.0.0 arrow-rs release
         // BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
         // the type of parquet is decimal(18,2)
         let schema =
@@ -683,10 +721,19 @@ mod tests {
                 false,
             )],
         );
+        let rgm3 = get_row_group_meta_data(
+            &schema_descr,
+            vec![ParquetStatistics::byte_array(None, None, None, 0, false)],
+        );
         let metrics = parquet_file_metrics();
         assert_eq!(
-            prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
-            vec![1]
+            prune_row_groups(
+                &[rgm1, rgm2, rgm3],
+                None,
+                Some(&pruning_predicate),
+                &metrics
+            ),
+            vec![1, 2]
         );
     }