You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by dh...@apache.org on 2023/01/31 14:04:20 UTC

[arrow-rs] branch master updated: Minor: Update doc strings about Page Index / Column Index (#3625)

This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new f78a9be8b Minor: Update doc strings about Page Index / Column Index (#3625)
f78a9be8b is described below

commit f78a9be8b3a7479418cacc5ea6755b1e60c4b03f
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Tue Jan 31 15:04:12 2023 +0100

    Minor: Update doc strings about Page Index / Column Index (#3625)
    
    * Minor: Update doc strings about what Page Index / Column Index
    
    * tweaks
    
    * typos
    
    * Apply suggestions from code review
    
    * Update parquet/src/file/metadata.rs
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    
    ---------
    
    Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
 parquet/src/file/metadata.rs                | 29 +++++++++++++++++++++++++++--
 parquet/src/file/page_encoding_stats.rs     |  2 ++
 parquet/src/file/page_index/index.rs        | 24 ++++++++++++++++++++----
 parquet/src/file/page_index/index_reader.rs | 28 ++++++++++++++++++++++++----
 parquet/src/file/page_index/mod.rs          |  4 ++++
 parquet/src/file/properties.rs              |  2 +-
 parquet/src/file/reader.rs                  |  5 +++--
 parquet/src/file/serialized_reader.rs       |  7 +++++--
 8 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index 0696b2901..a83f02dfd 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -50,7 +50,25 @@ use crate::schema::types::{
     Type as SchemaType,
 };
 
+/// [`Index`] for each row group of each column.
+///
+/// `column_index[row_group_number][column_number]` holds the
+/// [`Index`] corresponding to column `column_number` of row group
+/// `row_group_number`.
+///
+/// For example `column_index[2][3]` holds the [`Index`] for the fourth
+/// column in the third row group of the parquet file.
 pub type ParquetColumnIndex = Vec<Vec<Index>>;
+
+/// [`PageLocation`] for each datapage of each row group of each column.
+///
+/// `offset_index[row_group_number][column_number][page_number]` holds
+/// the [`PageLocation`] corresponding to page `page_number` of column
+/// `column_number` of row group `row_group_number`.
+///
+/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
+/// the fifth page of the fourth column in the third row group of the
+/// parquet file.
 pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
 
 /// Global Parquet metadata.
@@ -65,8 +83,8 @@ pub struct ParquetMetaData {
 }
 
 impl ParquetMetaData {
-    /// Creates Parquet metadata from file metadata and a list of row group metadata `Arc`s
-    /// for each available row group.
+    /// Creates Parquet metadata from file metadata and a list of row
+    /// group metadata
     pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
         ParquetMetaData {
             file_metadata,
@@ -76,6 +94,8 @@ impl ParquetMetaData {
         }
     }
 
+    /// Creates Parquet metadata from file metadata, a list of row
+    /// group metadata, and the column index structures.
     pub fn new_with_page_index(
         file_metadata: FileMetaData,
         row_groups: Vec<RowGroupMetaData>,
@@ -232,6 +252,7 @@ pub struct RowGroupMetaData {
     sorting_columns: Option<Vec<SortingColumn>>,
     total_byte_size: i64,
     schema_descr: SchemaDescPtr,
+    /// `page_offset_index[column_number][page_number]`
     page_offset_index: Option<Vec<Vec<PageLocation>>>,
 }
 
@@ -277,6 +298,8 @@ impl RowGroupMetaData {
     }
 
     /// Returns reference of page offset index of all column in this row group.
+    ///
+    /// The returned vector contains `page_offset[column_number][page_number]`
     pub fn page_offset_index(&self) -> Option<&Vec<Vec<PageLocation>>> {
         self.page_offset_index.as_ref()
     }
@@ -292,6 +315,8 @@ impl RowGroupMetaData {
     }
 
     /// Sets page offset index for this row group.
+    ///
+    /// The vector represents `page_offset[column_number][page_number]`
     pub fn set_page_offset(&mut self, page_offset: Vec<Vec<PageLocation>>) {
         self.page_offset_index = Some(page_offset);
     }
diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs
index eb2680478..95a731180 100644
--- a/parquet/src/file/page_encoding_stats.rs
+++ b/parquet/src/file/page_encoding_stats.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Per-page encoding information.
+
 use crate::basic::{Encoding, PageType};
 use crate::errors::Result;
 use crate::format::{
diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 83d55caa4..8f9cb6629 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! [`Index`] structures holding decoded [`ColumnIndex`] information
+
 use crate::basic::Type;
 use crate::data_type::private::ParquetValueType;
 use crate::data_type::{ByteArray, Int96};
@@ -23,7 +25,14 @@ use crate::format::{BoundaryOrder, ColumnIndex};
 use crate::util::bit_util::from_le_slice;
 use std::fmt::Debug;
 
-/// The statistics in one page
+/// PageIndex Statistics for one data page, as described in [Column Index].
+///
+/// One significant difference from the row group level
+/// [`Statistics`](crate::format::Statistics) is that page level
+/// statistics may not store actual column values as min and max
+/// (e.g. they may store truncated strings to save space)
+///
+/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct PageIndex<T> {
     /// The minimum value, It is None when all values are null
@@ -48,6 +57,10 @@ impl<T> PageIndex<T> {
 
 #[derive(Debug, Clone, PartialEq)]
 #[allow(non_camel_case_types)]
+/// Typed statistics for a data page in a column chunk. This structure
+/// is obtained from decoding the [`ColumnIndex`] in the parquet file
+/// and can be used to skip decoding pages while reading the file
+/// data.
 pub enum Index {
     /// Sometimes reading page index from parquet file
     /// will only return pageLocations without min_max index,
@@ -90,14 +103,17 @@ impl Index {
     }
 }
 
-/// An index of a column of [`Type`] physical representation
+/// Stores the [`PageIndex`] for each page of a column with [`Type`]
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct NativeIndex<T: ParquetValueType> {
-    /// The physical type
+    /// The physical type of this column
     pub physical_type: Type,
     /// The indexes, one item per page
     pub indexes: Vec<PageIndex<T>>,
-    /// the order
+    /// Whether the min/max elements are ordered, and if so in which
+    /// direction. See [source] for details.
+    ///
+    /// [source]: https://github.com/apache/parquet-format/blob/bfc549b93e6927cb1fc425466e4084f76edc6d22/src/main/thrift/parquet.thrift#L959-L964
     pub boundary_order: BoundaryOrder,
 }
 
diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index 36b1c9d6c..3ae37cf87 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Support for reading [`Index`] and [`PageLocation`] from parquet metadata.
+
 use crate::basic::Type;
 use crate::data_type::Int96;
 use crate::errors::ParquetError;
@@ -25,8 +27,17 @@ use crate::format::{ColumnIndex, OffsetIndex, PageLocation};
 use std::io::{Cursor, Read};
 use thrift::protocol::{TCompactInputProtocol, TSerializable};
 
-/// Read on row group's all columns indexes and change into  [`Index`]
-/// If not the format not available return an empty vector.
+/// Reads per-column [`Index`] for all columns of a row group by
+/// decoding [`ColumnIndex`].
+///
+/// Returns a vector of `index[column_number]`.
+///
+/// Returns an empty vector if this row group does not contain a
+/// [`ColumnIndex`].
+///
+/// See [Column Index Documentation] for more details.
+///
+/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub fn read_columns_indexes<R: ChunkReader>(
     reader: &R,
     chunks: &[ColumnChunkMetaData],
@@ -60,8 +71,17 @@ pub fn read_columns_indexes<R: ChunkReader>(
         .collect()
 }
 
-/// Read on row group's all indexes and change into  [`Index`]
-/// If not the format not available return an empty vector.
+/// Reads per-page [`PageLocation`] for all columns of a row group by
+/// decoding the [`OffsetIndex`].
+///
+/// Returns a vector of `location[column_number][page_number]`.
+///
+/// Returns an empty vector if this row group does not contain an
+/// [`OffsetIndex`].
+///
+/// See [Column Index Documentation] for more details.
+///
+/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
 pub fn read_pages_locations<R: ChunkReader>(
     reader: &R,
     chunks: &[ColumnChunkMetaData],
diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs
index dcc1120fc..9372645d7 100644
--- a/parquet/src/file/page_index/mod.rs
+++ b/parquet/src/file/page_index/mod.rs
@@ -15,5 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Page Index of "[Column Index] Layout to Support Page Skipping"
+//!
+//! [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+
 pub mod index;
 pub mod index_reader;
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index cbd31f9a1..2ce0050c9 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Writer properties.
+//! [`WriterProperties`]
 //!
 //! # Usage
 //!
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index bb82f2299..545f22709 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -15,8 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Contains file reader API and provides methods to access file metadata, row group
-//! readers to read individual column chunks, or access record iterator.
+//! File reader API and methods to access file metadata, row group
+//! readers to read individual column chunks, or access record
+//! iterator.
 
 use bytes::Bytes;
 use std::{boxed::Box, io::Read, sync::Arc};
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 95108ad58..e5ed26e9e 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -189,13 +189,16 @@ impl ReadOptionsBuilder {
         self
     }
 
-    /// Enable page index in the reading option,
+    /// Enable reading the page index structures described in
+    /// "[Column Index] Layout to Support Page Skipping"
+    ///
+    /// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
     pub fn with_page_index(mut self) -> Self {
         self.enable_page_index = true;
         self
     }
 
-    /// Set the `ReaderProperties` configuration.
+    /// Set the [`ReaderProperties`] configuration.
     pub fn with_reader_properties(mut self, properties: ReaderProperties) -> Self {
         self.props = Some(properties);
         self