You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/04/13 21:13:38 UTC
[arrow-datafusion] branch main updated: Don't use parquet file offset for file range pruning (#5997)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 5c025cc806 Don't use parquet file offset for file range pruning (#5997)
5c025cc806 is described below
commit 5c025cc8062558fee586a88d49e1d6de433a86be
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Apr 13 22:13:33 2023 +0100
Don't use parquet file offset for file range pruning (#5997)
* Don't use parquet file offset for file range pruning
* Update datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
* Format
* Tweak logic
* Update test
---------
Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
datafusion/core/src/physical_plan/file_format/parquet.rs | 8 ++++----
.../core/src/physical_plan/file_format/parquet/row_groups.rs | 8 +++++++-
2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs
index 349fa68a4b..c69cdb7417 100644
--- a/datafusion/core/src/physical_plan/file_format/parquet.rs
+++ b/datafusion/core/src/physical_plan/file_format/parquet.rs
@@ -1619,11 +1619,11 @@ mod tests {
.infer_schema(&state, &store, &[meta.clone()])
.await?;
- let group_empty = vec![vec![file_range(&meta, 0, 5)]];
- let group_contain = vec![vec![file_range(&meta, 5, i64::MAX)]];
+ let group_empty = vec![vec![file_range(&meta, 0, 2)]];
+ let group_contain = vec![vec![file_range(&meta, 2, i64::MAX)]];
let group_all = vec![vec![
- file_range(&meta, 0, 5),
- file_range(&meta, 5, i64::MAX),
+ file_range(&meta, 0, 2),
+ file_range(&meta, 2, i64::MAX),
]];
assert_parquet_read(&state, group_empty, None, file_schema.clone()).await?;
diff --git a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
index 376ae35c66..86cf06620c 100644
--- a/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
+++ b/datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs
@@ -53,7 +53,13 @@ pub(crate) fn prune_row_groups(
let mut filtered = Vec::with_capacity(groups.len());
for (idx, metadata) in groups.iter().enumerate() {
if let Some(range) = &range {
- let offset = metadata.column(0).file_offset();
+ // Figure out where the first dictionary page (or, if there is none, the first data page) is.
+ // Note: don't use the location of the column metadata (`file_offset()`), see
+ // <https://github.com/apache/arrow-datafusion/issues/5995>
+ let col = metadata.column(0);
+ let offset = col
+ .dictionary_page_offset()
+ .unwrap_or_else(|| col.data_page_offset());
if offset < range.start || offset >= range.end {
continue;
}