You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/06/06 12:10:28 UTC
[arrow] branch main updated: GH-34375: [C++][Parquet] Ignore page header stats when page index enabled (#35455)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 7f8ccb5ff1 GH-34375: [C++][Parquet] Ignore page header stats when page index enabled (#35455)
7f8ccb5ff1 is described below
commit 7f8ccb5ff17980a7ca9c1668f33689738c634144
Author: Gang Wu <us...@gmail.com>
AuthorDate: Tue Jun 6 21:06:44 2023 +0900
GH-34375: [C++][Parquet] Ignore page header stats when page index enabled (#35455)
### Rationale for this change
Page-level statistics are probably not used in production, and after adding column indexes they are useless.
parquet-mr already stopped writing them in https://issues.apache.org/jira/browse/PARQUET-1365.
### What changes are included in this PR?
Once page index is enabled for one column, it does not write page stats to the header any more.
### Are these changes tested?
Added a test to check page stats have been skipped.
### Are there any user-facing changes?
Yes (behavior change when page index is enabled).
* Closes: #34375
Authored-by: Gang Wu <us...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 21 +++++++++++++++++++--
cpp/src/parquet/column_writer.cc | 12 ++++++++++--
cpp/src/parquet/properties.h | 7 +++++--
docs/source/cpp/parquet.rst | 11 +++++------
4 files changed, 39 insertions(+), 12 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index ad33ca296a..ba3702ede4 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -5252,17 +5252,34 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test {
auto row_group_index_reader = page_index_reader->RowGroup(rg);
ASSERT_NE(row_group_index_reader, nullptr);
+ auto row_group_reader = reader->RowGroup(rg);
+ ASSERT_NE(row_group_reader, nullptr);
+
for (int col = 0; col < metadata->num_columns(); ++col) {
auto column_index = row_group_index_reader->GetColumnIndex(col);
column_indexes_.emplace_back(column_index.get());
+ bool expect_no_page_index =
+ expect_columns_without_index.find(col) != expect_columns_without_index.cend();
+
auto offset_index = row_group_index_reader->GetOffsetIndex(col);
- if (expect_columns_without_index.find(col) !=
- expect_columns_without_index.cend()) {
+ if (expect_no_page_index) {
ASSERT_EQ(offset_index, nullptr);
} else {
CheckOffsetIndex(offset_index.get(), expect_num_pages, &offset_lower_bound);
}
+
+ // Verify page stats are not written to page header if page index is enabled.
+ auto page_reader = row_group_reader->GetColumnPageReader(col);
+ ASSERT_NE(page_reader, nullptr);
+ std::shared_ptr<Page> page = nullptr;
+ while ((page = page_reader->NextPage()) != nullptr) {
+ if (page->type() == PageType::DATA_PAGE ||
+ page->type() == PageType::DATA_PAGE_V2) {
+ ASSERT_EQ(std::static_pointer_cast<DataPage>(page)->statistics().is_set(),
+ expect_no_page_index);
+ }
+ }
}
}
}
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index f298f282fe..33e9f8f665 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -460,7 +460,11 @@ class SerializedPageWriter : public PageWriter {
ToThrift(page.definition_level_encoding()));
data_page_header.__set_repetition_level_encoding(
ToThrift(page.repetition_level_encoding()));
- data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ // Write page statistics only when page index is not enabled.
+ if (column_index_builder_ == nullptr) {
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+ }
page_header.__set_type(format::PageType::DATA_PAGE);
page_header.__set_data_page_header(data_page_header);
@@ -479,7 +483,11 @@ class SerializedPageWriter : public PageWriter {
page.repetition_levels_byte_length());
data_page_header.__set_is_compressed(page.is_compressed());
- data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ // Write page statistics only when page index is not enabled.
+ if (column_index_builder_ == nullptr) {
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+ }
page_header.__set_type(format::PageType::DATA_PAGE_V2);
page_header.__set_data_page_header_v2(data_page_header);
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 0a9864de62..d1012fccf0 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -524,8 +524,11 @@ class PARQUET_EXPORT WriterProperties {
/// Enable writing page index in general for all columns. Default disabled.
///
- /// Page index contains statistics for data pages and can be used to skip pages
- /// when scanning data in ordered and unordered columns.
+ /// Writing statistics to the page index disables the old method of writing
+ /// statistics to each data page header.
+ /// The page index makes filtering more efficient than the page header, as
+ /// it gathers all the statistics for a Parquet file in a single place,
+ /// avoiding scattered I/O.
///
/// Please check the link below for more details:
/// https://github.com/apache/parquet-format/blob/master/PageIndex.md
diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst
index 0ea6063b2a..95f2d8d98d 100644
--- a/docs/source/cpp/parquet.rst
+++ b/docs/source/cpp/parquet.rst
@@ -304,6 +304,8 @@ Statistics are enabled by default for all columns. You can disable statistics fo
all columns or specific columns using ``disable_statistics`` on the builder.
There is a ``max_statistics_size`` which limits the maximum number of bytes that
may be used for min and max values, useful for types like strings or binary blobs.
+If a column has enabled page index using ``enable_write_page_index``, then it does
+not write statistics to the page header because it is duplicated in the ColumnIndex.
There are also Arrow-specific settings that can be configured with
:class:`parquet::ArrowWriterProperties`:
@@ -573,13 +575,13 @@ Miscellaneous
+--------------------------+----------+----------+---------+
| Feature | Reading | Writing | Notes |
+==========================+==========+==========+=========+
-| Column Index | ✓ | | \(1) |
+| Column Index | ✓ | ✓ | \(1) |
+--------------------------+----------+----------+---------+
-| Offset Index | ✓ | | \(1) |
+| Offset Index | ✓ | ✓ | \(1) |
+--------------------------+----------+----------+---------+
| Bloom Filter | ✓ | ✓ | \(2) |
+--------------------------+----------+----------+---------+
-| CRC checksums | ✓ | ✓ | \(3) |
+| CRC checksums | ✓ | ✓ | |
+--------------------------+----------+----------+---------+
* \(1) Access to the Column and Offset Index structures is provided, but
@@ -587,6 +589,3 @@ Miscellaneous
* \(2) APIs are provided for creating, serializing and deserializing Bloom
Filters, but they are not integrated into data read APIs.
-
-* \(3) For now, only the checksums of V1 Data Pages and Dictionary Pages
- are computed.