You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/06/06 12:10:28 UTC

[arrow] branch main updated: GH-34375: [C++][Parquet] Ignore page header stats when page index enabled (#35455)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 7f8ccb5ff1 GH-34375: [C++][Parquet] Ignore page header stats when page index enabled (#35455)
7f8ccb5ff1 is described below

commit 7f8ccb5ff17980a7ca9c1668f33689738c634144
Author: Gang Wu <us...@gmail.com>
AuthorDate: Tue Jun 6 21:06:44 2023 +0900

    GH-34375: [C++][Parquet] Ignore page header stats when page index enabled (#35455)
    
    ### Rationale for this change
    
    Page-level statistics are probably not used in production, and after adding column indexes they are useless.
    parquet-mr already stopped writing them in https://issues.apache.org/jira/browse/PARQUET-1365.
    
    ### What changes are included in this PR?
    
    Once page index is enabled for one column, it does not write page stats to the header any more.
    
    ### Are these changes tested?
    
    Added a test to check page stats have been skipped.
    
    ### Are there any user-facing changes?
    
    Yes (behavior change when page index is enabled).
    
    * Closes: #34375
    
    Authored-by: Gang Wu <us...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 21 +++++++++++++++++++--
 cpp/src/parquet/column_writer.cc                  | 12 ++++++++++--
 cpp/src/parquet/properties.h                      |  7 +++++--
 docs/source/cpp/parquet.rst                       | 11 +++++------
 4 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index ad33ca296a..ba3702ede4 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -5252,17 +5252,34 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test {
       auto row_group_index_reader = page_index_reader->RowGroup(rg);
       ASSERT_NE(row_group_index_reader, nullptr);
 
+      auto row_group_reader = reader->RowGroup(rg);
+      ASSERT_NE(row_group_reader, nullptr);
+
       for (int col = 0; col < metadata->num_columns(); ++col) {
         auto column_index = row_group_index_reader->GetColumnIndex(col);
         column_indexes_.emplace_back(column_index.get());
 
+        bool expect_no_page_index =
+            expect_columns_without_index.find(col) != expect_columns_without_index.cend();
+
         auto offset_index = row_group_index_reader->GetOffsetIndex(col);
-        if (expect_columns_without_index.find(col) !=
-            expect_columns_without_index.cend()) {
+        if (expect_no_page_index) {
           ASSERT_EQ(offset_index, nullptr);
         } else {
           CheckOffsetIndex(offset_index.get(), expect_num_pages, &offset_lower_bound);
         }
+
+        // Verify page stats are not written to page header if page index is enabled.
+        auto page_reader = row_group_reader->GetColumnPageReader(col);
+        ASSERT_NE(page_reader, nullptr);
+        std::shared_ptr<Page> page = nullptr;
+        while ((page = page_reader->NextPage()) != nullptr) {
+          if (page->type() == PageType::DATA_PAGE ||
+              page->type() == PageType::DATA_PAGE_V2) {
+            ASSERT_EQ(std::static_pointer_cast<DataPage>(page)->statistics().is_set(),
+                      expect_no_page_index);
+          }
+        }
       }
     }
   }
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index f298f282fe..33e9f8f665 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -460,7 +460,11 @@ class SerializedPageWriter : public PageWriter {
         ToThrift(page.definition_level_encoding()));
     data_page_header.__set_repetition_level_encoding(
         ToThrift(page.repetition_level_encoding()));
-    data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+    // Write page statistics only when page index is not enabled.
+    if (column_index_builder_ == nullptr) {
+      data_page_header.__set_statistics(ToThrift(page.statistics()));
+    }
 
     page_header.__set_type(format::PageType::DATA_PAGE);
     page_header.__set_data_page_header(data_page_header);
@@ -479,7 +483,11 @@ class SerializedPageWriter : public PageWriter {
         page.repetition_levels_byte_length());
 
     data_page_header.__set_is_compressed(page.is_compressed());
-    data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+    // Write page statistics only when page index is not enabled.
+    if (column_index_builder_ == nullptr) {
+      data_page_header.__set_statistics(ToThrift(page.statistics()));
+    }
 
     page_header.__set_type(format::PageType::DATA_PAGE_V2);
     page_header.__set_data_page_header_v2(data_page_header);
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 0a9864de62..d1012fccf0 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -524,8 +524,11 @@ class PARQUET_EXPORT WriterProperties {
 
     /// Enable writing page index in general for all columns. Default disabled.
     ///
-    /// Page index contains statistics for data pages and can be used to skip pages
-    /// when scanning data in ordered and unordered columns.
+    /// Writing statistics to the page index disables the old method of writing
+    /// statistics to each data page header.
+    /// The page index makes filtering more efficient than the page header, as
+    /// it gathers all the statistics for a Parquet file in a single place,
+    /// avoiding scattered I/O.
     ///
     /// Please check the link below for more details:
     /// https://github.com/apache/parquet-format/blob/master/PageIndex.md
diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst
index 0ea6063b2a..95f2d8d98d 100644
--- a/docs/source/cpp/parquet.rst
+++ b/docs/source/cpp/parquet.rst
@@ -304,6 +304,8 @@ Statistics are enabled by default for all columns. You can disable statistics fo
 all columns or specific columns using ``disable_statistics`` on the builder.
 There is a ``max_statistics_size`` which limits the maximum number of bytes that
 may be used for min and max values, useful for types like strings or binary blobs.
+If a column has enabled page index using ``enable_write_page_index``, then it does
+not write statistics to the page header because it is duplicated in the ColumnIndex.
 
 There are also Arrow-specific settings that can be configured with
 :class:`parquet::ArrowWriterProperties`:
@@ -573,13 +575,13 @@ Miscellaneous
 +--------------------------+----------+----------+---------+
 | Feature                  | Reading  | Writing  | Notes   |
 +==========================+==========+==========+=========+
-| Column Index             | ✓        |          | \(1)    |
+| Column Index             | ✓        | ✓        | \(1)    |
 +--------------------------+----------+----------+---------+
-| Offset Index             | ✓        |          | \(1)    |
+| Offset Index             | ✓        | ✓        | \(1)    |
 +--------------------------+----------+----------+---------+
 | Bloom Filter             | ✓        | ✓        | \(2)    |
 +--------------------------+----------+----------+---------+
-| CRC checksums            | ✓        | ✓        | \(3)    |
+| CRC checksums            | ✓        | ✓        |         |
 +--------------------------+----------+----------+---------+
 
 * \(1) Access to the Column and Offset Index structures is provided, but
@@ -587,6 +589,3 @@ Miscellaneous
 
 * \(2) APIs are provided for creating, serializing and deserializing Bloom
   Filters, but they are not integrated into data read APIs.
-
-* \(3) For now, only the checksums of V1 Data Pages and Dictionary Pages
-  are computed.