Posted to commits@arrow.apache.org by al...@apache.org on 2020/11/21 23:17:19 UTC

[arrow] branch master updated: ARROW-10620: [Rust][Parquet] move column chunk range logic to metadata.rs

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e8be3c  ARROW-10620: [Rust][Parquet] move column chunk range logic to metadata.rs
0e8be3c is described below

commit 0e8be3caa4a6a2dc71e4cc5cb6fe9ac375f5648d
Author: rdettai <rd...@gmail.com>
AuthorDate: Sat Nov 21 18:16:50 2020 -0500

    ARROW-10620: [Rust][Parquet] move column chunk range logic to metadata.rs
    
    > Getting the byte range of a column chunk inside a Parquet file can be useful for external crates (for instance, if they want to pre-fetch columns), and computing it is not completely obvious: a quick look at [1] and [2] shows how messy things can get. (A brief usage sketch follows the commit message below.)
    >
    > I think it would be nice to move this logic into the metadata definition rather than leave it buried in the middle of the reader implementation.
    >
    > [1] https://stackoverflow.com/questions/55225108/why-is-dictionary-page-offset-0-for-plain-dictionary-encoding/
    > [2] https://issues.apache.org/jira/browse/PARQUET-816
    
    https://issues.apache.org/jira/browse/ARROW-10620
    
    Closes #8682 from rdettai/ARROW-10620-chunk-range
    
    Authored-by: rdettai <rd...@gmail.com>
    Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
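
Editor's note: below is a minimal, hypothetical sketch of the pre-fetching use case mentioned in the description above. It relies only on the new ColumnChunkMetaData::byte_range() accessor added by this commit plus standard file I/O; the function name, the file-path argument, and the eager read-everything strategy are illustrative assumptions, not part of the change.

    // Sketch only: read the raw (still compressed) bytes of every column chunk
    // described by `metadata`, using the byte_range() helper introduced in this
    // commit. Error handling and the eager-read strategy are assumptions.
    use std::fs::File;
    use std::io::{Read, Seek, SeekFrom};

    use parquet::file::metadata::ParquetMetaData;

    fn prefetch_column_chunks(
        path: &str,
        metadata: &ParquetMetaData,
    ) -> std::io::Result<Vec<Vec<u8>>> {
        let mut file = File::open(path)?;
        let mut chunks = Vec::new();
        for rg in 0..metadata.num_row_groups() {
            let row_group = metadata.row_group(rg);
            for col in 0..row_group.num_columns() {
                // byte_range() hides the dictionary-page-offset subtlety
                // referenced by [1] and [2] in the description above.
                let (start, len) = row_group.column(col).byte_range();
                let mut buf = vec![0u8; len as usize];
                file.seek(SeekFrom::Start(start))?;
                file.read_exact(&mut buf)?;
                chunks.push(buf);
            }
        }
        Ok(chunks)
    }

The serialized_reader.rs hunk in the diff below applies the same helper internally, removing the duplicated offset logic from get_column_page_reader.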
---
 rust/parquet/src/file/metadata.rs          | 15 +++++++++++++++
 rust/parquet/src/file/serialized_reader.rs | 11 ++---------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/rust/parquet/src/file/metadata.rs b/rust/parquet/src/file/metadata.rs
index b6b489d..8565115 100644
--- a/rust/parquet/src/file/metadata.rs
+++ b/rust/parquet/src/file/metadata.rs
@@ -433,6 +433,21 @@ impl ColumnChunkMetaData {
         self.dictionary_page_offset
     }
 
+    /// Returns the offset and length in bytes of the column chunk within the file
+    pub fn byte_range(&self) -> (u64, u64) {
+        let col_start = if self.has_dictionary_page() {
+            self.dictionary_page_offset().unwrap()
+        } else {
+            self.data_page_offset()
+        };
+        let col_len = self.compressed_size();
+        assert!(
+            col_start >= 0 && col_len >= 0,
+            "column start and length should not be negative"
+        );
+        (col_start as u64, col_len as u64)
+    }
+
     /// Returns statistics that are set for this column chunk,
     /// or `None` if no statistics are available.
     pub fn statistics(&self) -> Option<&Statistics> {
diff --git a/rust/parquet/src/file/serialized_reader.rs b/rust/parquet/src/file/serialized_reader.rs
index bd246af..663412d 100644
--- a/rust/parquet/src/file/serialized_reader.rs
+++ b/rust/parquet/src/file/serialized_reader.rs
@@ -191,15 +191,8 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<'
     // TODO: fix PARQUET-816
     fn get_column_page_reader(&self, i: usize) -> Result<Box<PageReader>> {
         let col = self.metadata.column(i);
-        let col_start = if col.has_dictionary_page() {
-            col.dictionary_page_offset().unwrap()
-        } else {
-            col.data_page_offset()
-        };
-        let col_length = col.compressed_size();
-        let file_chunk = self
-            .chunk_reader
-            .get_read(col_start as u64, col_length as usize)?;
+        let (col_start, col_length) = col.byte_range();
+        let file_chunk = self.chunk_reader.get_read(col_start, col_length as usize)?;
         let page_reader = SerializedPageReader::new(
             file_chunk,
             col.num_values(),