You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by dh...@apache.org on 2023/01/31 14:04:20 UTC
[arrow-rs] branch master updated: Minor: Update doc strings about Page Index / Column Index (#3625)
This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new f78a9be8b Minor: Update doc strings about Page Index / Column Index (#3625)
f78a9be8b is described below
commit f78a9be8b3a7479418cacc5ea6755b1e60c4b03f
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Tue Jan 31 15:04:12 2023 +0100
Minor: Update doc strings about Page Index / Column Index (#3625)
* Minor: Update doc strings about what Page Index / Column Index
* tweaks
* typos
* Apply suggestions from code review
* Update parquet/src/file/metadata.rs
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---------
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
parquet/src/file/metadata.rs | 29 +++++++++++++++++++++++++++--
parquet/src/file/page_encoding_stats.rs | 2 ++
parquet/src/file/page_index/index.rs | 24 ++++++++++++++++++++----
parquet/src/file/page_index/index_reader.rs | 28 ++++++++++++++++++++++++----
parquet/src/file/page_index/mod.rs | 4 ++++
parquet/src/file/properties.rs | 2 +-
parquet/src/file/reader.rs | 5 +++--
parquet/src/file/serialized_reader.rs | 7 +++++--
8 files changed, 86 insertions(+), 15 deletions(-)
diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index 0696b2901..a83f02dfd 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -50,7 +50,25 @@ use crate::schema::types::{
Type as SchemaType,
};
+/// [`Index`] for each row group of each column.
+///
+/// `column_index[row_group_number][column_number]` holds the
+/// [`Index`] corresponding to column `column_number` of row group
+/// `row_group_number`.
+///
+/// For example `column_index[2][3]` holds the [`Index`] for the fourth
+/// column in the third row group of the parquet file.
pub type ParquetColumnIndex = Vec<Vec<Index>>;
+
+/// [`PageLocation`] for each data page of each row group of each column.
+///
+/// `offset_index[row_group_number][column_number][page_number]` holds
+/// the [`PageLocation`] corresponding to page `page_number` of column
+/// `column_number` of row group `row_group_number`.
+///
+/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
+/// the fifth page of the fourth column in the third row group of the
+/// parquet file.
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;
/// Global Parquet metadata.
@@ -65,8 +83,8 @@ pub struct ParquetMetaData {
}
impl ParquetMetaData {
- /// Creates Parquet metadata from file metadata and a list of row group metadata `Arc`s
- /// for each available row group.
+ /// Creates Parquet metadata from file metadata and a list of row
+ /// group metadata
pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
ParquetMetaData {
file_metadata,
@@ -76,6 +94,8 @@ impl ParquetMetaData {
}
}
+ /// Creates Parquet metadata from file metadata, a list of row
+ /// group metadata, and the column index structures.
pub fn new_with_page_index(
file_metadata: FileMetaData,
row_groups: Vec<RowGroupMetaData>,
@@ -232,6 +252,7 @@ pub struct RowGroupMetaData {
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
schema_descr: SchemaDescPtr,
+ /// `page_offset_index[column_number][page_number]`
page_offset_index: Option<Vec<Vec<PageLocation>>>,
}
@@ -277,6 +298,8 @@ impl RowGroupMetaData {
}
/// Returns reference of page offset index of all column in this row group.
+ ///
+ /// The returned vector contains `page_offset[column_number][page_number]`
pub fn page_offset_index(&self) -> Option<&Vec<Vec<PageLocation>>> {
self.page_offset_index.as_ref()
}
@@ -292,6 +315,8 @@ impl RowGroupMetaData {
}
/// Sets page offset index for this row group.
+ ///
+ /// The vector represents `page_offset[column_number][page_number]`
pub fn set_page_offset(&mut self, page_offset: Vec<Vec<PageLocation>>) {
self.page_offset_index = Some(page_offset);
}
diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs
index eb2680478..95a731180 100644
--- a/parquet/src/file/page_encoding_stats.rs
+++ b/parquet/src/file/page_encoding_stats.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+//! Per-page encoding information.
+
use crate::basic::{Encoding, PageType};
use crate::errors::Result;
use crate::format::{
diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs
index 83d55caa4..8f9cb6629 100644
--- a/parquet/src/file/page_index/index.rs
+++ b/parquet/src/file/page_index/index.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+//! [`Index`] structures holding decoded [`ColumnIndex`] information
+
use crate::basic::Type;
use crate::data_type::private::ParquetValueType;
use crate::data_type::{ByteArray, Int96};
@@ -23,7 +25,14 @@ use crate::format::{BoundaryOrder, ColumnIndex};
use crate::util::bit_util::from_le_slice;
use std::fmt::Debug;
-/// The statistics in one page
+/// PageIndex Statistics for one data page, as described in [Column Index].
+///
+/// One significant difference from the row group level
+/// [`Statistics`](crate::format::Statistics) is that page level
+/// statistics may not store actual column values as min and max
+/// (e.g. they may store truncated strings to save space)
+///
+/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PageIndex<T> {
/// The minimum value, It is None when all values are null
@@ -48,6 +57,10 @@ impl<T> PageIndex<T> {
#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
+/// Typed statistics for a data page in a column chunk. This structure
+/// is obtained from decoding the [ColumnIndex] in the parquet file
+/// and can be used to skip decoding pages while reading the file
+/// data.
pub enum Index {
/// Sometimes reading page index from parquet file
/// will only return pageLocations without min_max index,
@@ -90,14 +103,17 @@ impl Index {
}
}
-/// An index of a column of [`Type`] physical representation
+/// Stores the [`PageIndex`] for each page of a column with [`Type`]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NativeIndex<T: ParquetValueType> {
- /// The physical type
+ /// The physical type of this column
pub physical_type: Type,
/// The indexes, one item per page
pub indexes: Vec<PageIndex<T>>,
- /// the order
+ /// If the min/max elements are ordered, and if so in which
+ /// direction. See [source] for details.
+ ///
+ /// [source]: https://github.com/apache/parquet-format/blob/bfc549b93e6927cb1fc425466e4084f76edc6d22/src/main/thrift/parquet.thrift#L959-L964
pub boundary_order: BoundaryOrder,
}
diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs
index 36b1c9d6c..3ae37cf87 100644
--- a/parquet/src/file/page_index/index_reader.rs
+++ b/parquet/src/file/page_index/index_reader.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+//! Support for reading [`Index`] and [`PageLocation`] from parquet metadata.
+
use crate::basic::Type;
use crate::data_type::Int96;
use crate::errors::ParquetError;
@@ -25,8 +27,17 @@ use crate::format::{ColumnIndex, OffsetIndex, PageLocation};
use std::io::{Cursor, Read};
use thrift::protocol::{TCompactInputProtocol, TSerializable};
-/// Read on row group's all columns indexes and change into [`Index`]
-/// If not the format not available return an empty vector.
+/// Reads per-column [`Index`] for all columns of a row group by
+/// decoding [`ColumnIndex`].
+///
+/// Returns a vector of `index[column_number]`.
+///
+/// Returns an empty vector if this row group does not contain a
+/// [`ColumnIndex`].
+///
+/// See [Column Index Documentation] for more details.
+///
+/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn read_columns_indexes<R: ChunkReader>(
reader: &R,
chunks: &[ColumnChunkMetaData],
@@ -60,8 +71,17 @@ pub fn read_columns_indexes<R: ChunkReader>(
.collect()
}
-/// Read on row group's all indexes and change into [`Index`]
-/// If not the format not available return an empty vector.
+/// Reads per-page [`PageLocation`] for all columns of a row group by
+/// decoding the [`OffsetIndex`].
+///
+/// Returns a vector of `location[column_number][page_number]`
+///
+/// Returns an empty vector if this row group does not contain an
+/// [`OffsetIndex`].
+///
+/// See [Column Index Documentation] for more details.
+///
+/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn read_pages_locations<R: ChunkReader>(
reader: &R,
chunks: &[ColumnChunkMetaData],
diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs
index dcc1120fc..9372645d7 100644
--- a/parquet/src/file/page_index/mod.rs
+++ b/parquet/src/file/page_index/mod.rs
@@ -15,5 +15,9 @@
// specific language governing permissions and limitations
// under the License.
+//! Page Index of "[Column Index] Layout to Support Page Skipping"
+//!
+//! [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+
pub mod index;
pub mod index_reader;
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index cbd31f9a1..2ce0050c9 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! Writer properties.
+//! [`WriterProperties`]
//!
//! # Usage
//!
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index bb82f2299..545f22709 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -15,8 +15,9 @@
// specific language governing permissions and limitations
// under the License.
-//! Contains file reader API and provides methods to access file metadata, row group
-//! readers to read individual column chunks, or access record iterator.
+//! File reader API and methods to access file metadata, row group
+//! readers to read individual column chunks, or access record
+//! iterator.
use bytes::Bytes;
use std::{boxed::Box, io::Read, sync::Arc};
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 95108ad58..e5ed26e9e 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -189,13 +189,16 @@ impl ReadOptionsBuilder {
self
}
- /// Enable page index in the reading option,
+ /// Enable reading the page index structures described in
+ /// "[Column Index] Layout to Support Page Skipping"
+ ///
+ /// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn with_page_index(mut self) -> Self {
self.enable_page_index = true;
self
}
- /// Set the `ReaderProperties` configuration.
+ /// Set the [`ReaderProperties`] configuration.
pub fn with_reader_properties(mut self, properties: ReaderProperties) -> Self {
self.props = Some(properties);
self