You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by li...@apache.org on 2023/01/05 03:16:22 UTC
[arrow-datafusion] branch master updated: Add test cases: row group filter with missing statistics for decimal data type (#4810)
This is an automated email from the ASF dual-hosted git repository.
liukun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 087ac0993 Add test cases: row group filter with missing statistics for decimal data type (#4810)
087ac0993 is described below
commit 087ac099394bb275d55a860d9ec029ce23078aa2
Author: Kun Liu <li...@apache.org>
AuthorDate: Thu Jan 5 11:16:17 2023 +0800
Add test cases: row group filter with missing statistics for decimal data type (#4810)
* add null case for row group filter
* Apply suggestions from code review
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
.../file_format/parquet/row_groups.rs | 69 ++++++++++++++++++----
1 file changed, 58 insertions(+), 11 deletions(-)
diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
index 716c47cf9..4aae795b4 100644
--- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
+++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
@@ -475,10 +475,21 @@ mod tests {
// c1 > 5, this row group will not be included in the results.
vec![ParquetStatistics::int32(Some(10), Some(20), None, 0, false)],
);
+ let rgm3 = get_row_group_meta_data(
+ &schema_descr,
+ // [1, None]
+ // c1 > 5, but the statistics are incomplete (no max), so this row group cannot be filtered out and will be included in the results.
+ vec![ParquetStatistics::int32(Some(100), None, None, 0, false)],
+ );
let metrics = parquet_file_metrics();
assert_eq!(
- prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
- vec![0]
+ prune_row_groups(
+ &[rgm1, rgm2, rgm3],
+ None,
+ Some(&pruning_predicate),
+ &metrics
+ ),
+ vec![0, 2]
);
// INT32: c1 > 5, but parquet decimal type has different precision or scale to arrow decimal
@@ -528,15 +539,21 @@ mod tests {
// c1 > 5, this row group will not be included in the results.
vec![ParquetStatistics::int32(Some(0), Some(2), None, 0, false)],
);
+ let rgm4 = get_row_group_meta_data(
+ &schema_descr,
+ // [None, 2]
+ // c1 > 5, but the statistics are incomplete (no min), so this row group cannot be filtered out and will be included in the results.
+ vec![ParquetStatistics::int32(None, Some(2), None, 0, false)],
+ );
let metrics = parquet_file_metrics();
assert_eq!(
prune_row_groups(
- &[rgm1, rgm2, rgm3],
+ &[rgm1, rgm2, rgm3, rgm4],
None,
Some(&pruning_predicate),
&metrics
),
- vec![0, 1]
+ vec![0, 1, 3]
);
// INT64: c1 < 5, the c1 is decimal(18,2)
@@ -572,10 +589,20 @@ mod tests {
// [0.1, 0.2]
vec![ParquetStatistics::int64(Some(10), Some(20), None, 0, false)],
);
+ let rgm3 = get_row_group_meta_data(
+ &schema_descr,
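+ // [None, None]: min/max statistics are missing, so this row group cannot be filtered out and will be included in the results.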
+ vec![ParquetStatistics::int64(None, None, None, 0, false)],
+ );
let metrics = parquet_file_metrics();
assert_eq!(
- prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
- vec![1]
+ prune_row_groups(
+ &[rgm1, rgm2, rgm3],
+ None,
+ Some(&pruning_predicate),
+ &metrics
+ ),
+ vec![1, 2]
);
// FIXED_LENGTH_BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
@@ -631,13 +658,24 @@ mod tests {
false,
)],
);
+
+ let rgm3 = get_row_group_meta_data(
+ &schema_descr,
+ vec![ParquetStatistics::fixed_len_byte_array(
+ None, None, None, 0, false,
+ )],
+ );
let metrics = parquet_file_metrics();
assert_eq!(
- prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
- vec![1]
+ prune_row_groups(
+ &[rgm1, rgm2, rgm3],
+ None,
+ Some(&pruning_predicate),
+ &metrics
+ ),
+ vec![1, 2]
);
- // TODO: BYTE_ARRAY support read decimal from parquet, after the 20.0.0 arrow-rs release
// BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
// the type of parquet is decimal(18,2)
let schema =
@@ -683,10 +721,19 @@ mod tests {
false,
)],
);
+ let rgm3 = get_row_group_meta_data(
+ &schema_descr,
+ vec![ParquetStatistics::byte_array(None, None, None, 0, false)],
+ );
let metrics = parquet_file_metrics();
assert_eq!(
- prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
- vec![1]
+ prune_row_groups(
+ &[rgm1, rgm2, rgm3],
+ None,
+ Some(&pruning_predicate),
+ &metrics
+ ),
+ vec![1, 2]
);
}