You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2020/11/21 23:17:19 UTC
[arrow] branch master updated: ARROW-10620: [Rust][Parquet] move
column chunk range logic to metadata.rs
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0e8be3c ARROW-10620: [Rust][Parquet] move column chunk range logic to metadata.rs
0e8be3c is described below
commit 0e8be3caa4a6a2dc71e4cc5cb6fe9ac375f5648d
Author: rdettai <rd...@gmail.com>
AuthorDate: Sat Nov 21 18:16:50 2020 -0500
ARROW-10620: [Rust][Parquet] move column chunk range logic to metadata.rs
> Getting the range of bytes of a column chunk inside a parquet file can be useful for external crates (for instance if they want to pre-fetch the columns), and is not completely obvious (it is enough to take a look at [1] and [2] to see that things can quickly get messy).
>
> I think it would be nice to move this logic into the metadata definition rather than have it lost in the middle of the reader implementation.
>
> [1] https://stackoverflow.com/questions/55225108/why-is-dictionary-page-offset-0-for-plain-dictionary-encoding/
> [2] https://issues.apache.org/jira/browse/PARQUET-816
https://issues.apache.org/jira/browse/ARROW-10620
Closes #8682 from rdettai/ARROW-10620-chunk-range
Authored-by: rdettai <rd...@gmail.com>
Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
rust/parquet/src/file/metadata.rs | 15 +++++++++++++++
rust/parquet/src/file/serialized_reader.rs | 11 ++---------
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/rust/parquet/src/file/metadata.rs b/rust/parquet/src/file/metadata.rs
index b6b489d..8565115 100644
--- a/rust/parquet/src/file/metadata.rs
+++ b/rust/parquet/src/file/metadata.rs
@@ -433,6 +433,21 @@ impl ColumnChunkMetaData {
self.dictionary_page_offset
}
+ /// Returns the offset and length in bytes of the column chunk within the file
+ pub fn byte_range(&self) -> (u64, u64) {
+ let col_start = if self.has_dictionary_page() {
+ self.dictionary_page_offset().unwrap()
+ } else {
+ self.data_page_offset()
+ };
+ let col_len = self.compressed_size();
+ assert!(
+ col_start >= 0 && col_len >= 0,
+ "column start and length should not be negative"
+ );
+ (col_start as u64, col_len as u64)
+ }
+
/// Returns statistics that are set for this column chunk,
/// or `None` if no statistics are available.
pub fn statistics(&self) -> Option<&Statistics> {
diff --git a/rust/parquet/src/file/serialized_reader.rs b/rust/parquet/src/file/serialized_reader.rs
index bd246af..663412d 100644
--- a/rust/parquet/src/file/serialized_reader.rs
+++ b/rust/parquet/src/file/serialized_reader.rs
@@ -191,15 +191,8 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<'
// TODO: fix PARQUET-816
fn get_column_page_reader(&self, i: usize) -> Result<Box<PageReader>> {
let col = self.metadata.column(i);
- let col_start = if col.has_dictionary_page() {
- col.dictionary_page_offset().unwrap()
- } else {
- col.data_page_offset()
- };
- let col_length = col.compressed_size();
- let file_chunk = self
- .chunk_reader
- .get_read(col_start as u64, col_length as usize)?;
+ let (col_start, col_length) = col.byte_range();
+ let file_chunk = self.chunk_reader.get_read(col_start, col_length as usize)?;
let page_reader = SerializedPageReader::new(
file_chunk,
col.num_values(),