You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/06/14 19:07:29 UTC

[arrow] branch main updated: GH-35926: [C++][Parquet] Allow disabling ColumnIndex by disabling statistics (#35958)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 75e4764ddb GH-35926: [C++][Parquet] Allow disabling ColumnIndex by disabling statistics (#35958)
75e4764ddb is described below

commit 75e4764ddbd06cd9e5146b435fb42fac8a21d485
Author: mwish <ma...@gmail.com>
AuthorDate: Thu Jun 15 03:07:21 2023 +0800

    GH-35926: [C++][Parquet] Allow disabling ColumnIndex by disabling statistics (#35958)
    
    When statistics are disabled for a column, then only OffsetIndex should be emitted, not ColumnIndex.
    Also, fix a bug in the `ColumnProperties` constructor where `page_index_enabled` would not be forwarded correctly.
    
    * Closes: #35926
    
    Authored-by: mwish <ma...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 64 +++++++++++++++++++++++
 cpp/src/parquet/file_writer.cc                    |  3 +-
 cpp/src/parquet/printer.cc                        |  6 ++-
 cpp/src/parquet/properties.h                      |  2 +-
 4 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index ba3702ede4..47570405c9 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -5350,6 +5350,70 @@ TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTrip) {
                             /*null_counts=*/{1}}));
 }
 
+TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTripWithStatsDisabled) {
+  auto writer_properties = WriterProperties::Builder()
+                               .enable_write_page_index()
+                               ->disable_statistics()
+                               ->build();
+  auto schema = ::arrow::schema({::arrow::field("c0", ::arrow::int64()),
+                                 ::arrow::field("c1", ::arrow::utf8()),
+                                 ::arrow::field("c2", ::arrow::list(::arrow::int64()))});
+  WriteFile(writer_properties, ::arrow::TableFromJSON(schema, {R"([
+      [1,     "a",  [1]      ],
+      [2,     "b",  [1, 2]   ],
+      [3,     "c",  [null]   ],
+      [null,  "d",  []       ],
+      [5,     null, [3, 3, 3]],
+      [6,     "f",  null     ]
+    ])"}));
+
+  ReadPageIndexes(/*expect_num_row_groups=*/1, /*expect_num_pages=*/1);
+  for (auto& column_index : column_indexes_) {
+    // Means page index is empty.
+    EXPECT_EQ(ColumnIndexObject{}, column_index);
+  }
+}
+
+TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTripWithColumnStatsDisabled) {
+  auto writer_properties = WriterProperties::Builder()
+                               .enable_write_page_index()
+                               ->disable_statistics("c0")
+                               ->max_row_group_length(4)
+                               ->build();
+  auto schema = ::arrow::schema({::arrow::field("c0", ::arrow::int64()),
+                                 ::arrow::field("c1", ::arrow::utf8()),
+                                 ::arrow::field("c2", ::arrow::list(::arrow::int64()))});
+  WriteFile(writer_properties, ::arrow::TableFromJSON(schema, {R"([
+      [1,     "a",  [1]      ],
+      [2,     "b",  [1, 2]   ],
+      [3,     "c",  [null]   ],
+      [null,  "d",  []       ],
+      [5,     null, [3, 3, 3]],
+      [6,     "f",  null     ]
+    ])"}));
+
+  ReadPageIndexes(/*expect_num_row_groups=*/2, /*expect_num_pages=*/1);
+
+  ColumnIndexObject empty_column_index{};
+  EXPECT_THAT(
+      column_indexes_,
+      ::testing::ElementsAre(
+          empty_column_index,
+          ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{"a"},
+                            /*max_values=*/{"d"}, BoundaryOrder::Ascending,
+                            /*null_counts=*/{0}},
+          ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{encode_int64(1)},
+                            /*max_values=*/{encode_int64(2)}, BoundaryOrder::Ascending,
+                            /*null_counts=*/{2}},
+          empty_column_index,
+          ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{"f"},
+                            /*max_values=*/{"f"}, BoundaryOrder::Ascending,
+                            /*null_counts=*/{1}},
+          ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{encode_int64(3)},
+                            /*max_values=*/{encode_int64(3)}, BoundaryOrder::Ascending,
+                            /*null_counts=*/{1}}));
+}
+
 TEST_F(ParquetPageIndexRoundTripTest, DropLargeStats) {
   auto writer_properties = WriterProperties::Builder()
                                .enable_write_page_index()
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index c91567e70d..42ce7591cb 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -146,7 +146,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
     auto data_encryptor =
         file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
                         : nullptr;
-    auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path)
+    auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path) &&
+                              properties_->statistics_enabled(path)
                           ? page_index_builder_->GetColumnIndexBuilder(column_ordinal)
                           : nullptr;
     auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path)
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 6e968c1218..2c81abb9ee 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -281,13 +281,15 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
              << "\"StatsSet\": ";
       if (column_chunk->is_stats_set()) {
         stream << "\"True\", \"Stats\": {";
-        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
-        stream << "\"NumNulls\": \"" << stats->null_count();
+        if (stats->HasNullCount()) {
+          stream << "\"NumNulls\": \"" << stats->null_count();
+        }
         if (stats->HasDistinctCount()) {
           stream << "\", "
                  << "\"DistinctValues\": \"" << stats->distinct_count();
         }
         if (stats->HasMinMax()) {
+          std::string min = stats->EncodeMin(), max = stats->EncodeMax();
           stream << "\", "
                  << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
                  << "\", "
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index d1012fccf0..f16f6c49d3 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -154,7 +154,7 @@ class PARQUET_EXPORT ColumnProperties {
         statistics_enabled_(statistics_enabled),
         max_stats_size_(max_stats_size),
         compression_level_(Codec::UseDefaultCompressionLevel()),
-        page_index_enabled_(DEFAULT_IS_PAGE_INDEX_ENABLED) {}
+        page_index_enabled_(page_index_enabled) {}
 
   void set_encoding(Encoding::type encoding) { encoding_ = encoding; }