You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/06/14 19:07:29 UTC
[arrow] branch main updated: GH-35926: [C++][Parquet] Allow disabling ColumnIndex by disabling statistics (#35958)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 75e4764ddb GH-35926: [C++][Parquet] Allow disabling ColumnIndex by disabling statistics (#35958)
75e4764ddb is described below
commit 75e4764ddbd06cd9e5146b435fb42fac8a21d485
Author: mwish <ma...@gmail.com>
AuthorDate: Thu Jun 15 03:07:21 2023 +0800
GH-35926: [C++][Parquet] Allow disabling ColumnIndex by disabling statistics (#35958)
When statistics are disabled for a column, then only OffsetIndex should be emitted, not ColumnIndex.
Also, fix a bug in the `ColumnProperties` constructor where `page_index_enabled` would not be forwarded correctly.
* Closes: #35926
Authored-by: mwish <ma...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 64 +++++++++++++++++++++++
cpp/src/parquet/file_writer.cc | 3 +-
cpp/src/parquet/printer.cc | 6 ++-
cpp/src/parquet/properties.h | 2 +-
4 files changed, 71 insertions(+), 4 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index ba3702ede4..47570405c9 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -5350,6 +5350,70 @@ TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTrip) {
/*null_counts=*/{1}}));
}
+TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTripWithStatsDisabled) {
+ auto writer_properties = WriterProperties::Builder()
+ .enable_write_page_index()
+ ->disable_statistics()
+ ->build();
+ auto schema = ::arrow::schema({::arrow::field("c0", ::arrow::int64()),
+ ::arrow::field("c1", ::arrow::utf8()),
+ ::arrow::field("c2", ::arrow::list(::arrow::int64()))});
+ WriteFile(writer_properties, ::arrow::TableFromJSON(schema, {R"([
+ [1, "a", [1] ],
+ [2, "b", [1, 2] ],
+ [3, "c", [null] ],
+ [null, "d", [] ],
+ [5, null, [3, 3, 3]],
+ [6, "f", null ]
+ ])"}));
+
+ ReadPageIndexes(/*expect_num_row_groups=*/1, /*expect_num_pages=*/1);
+ for (auto& column_index : column_indexes_) {
+ // Means page index is empty.
+ EXPECT_EQ(ColumnIndexObject{}, column_index);
+ }
+}
+
+TEST_F(ParquetPageIndexRoundTripTest, SimpleRoundTripWithColumnStatsDisabled) {
+ auto writer_properties = WriterProperties::Builder()
+ .enable_write_page_index()
+ ->disable_statistics("c0")
+ ->max_row_group_length(4)
+ ->build();
+ auto schema = ::arrow::schema({::arrow::field("c0", ::arrow::int64()),
+ ::arrow::field("c1", ::arrow::utf8()),
+ ::arrow::field("c2", ::arrow::list(::arrow::int64()))});
+ WriteFile(writer_properties, ::arrow::TableFromJSON(schema, {R"([
+ [1, "a", [1] ],
+ [2, "b", [1, 2] ],
+ [3, "c", [null] ],
+ [null, "d", [] ],
+ [5, null, [3, 3, 3]],
+ [6, "f", null ]
+ ])"}));
+
+ ReadPageIndexes(/*expect_num_row_groups=*/2, /*expect_num_pages=*/1);
+
+ ColumnIndexObject empty_column_index{};
+ EXPECT_THAT(
+ column_indexes_,
+ ::testing::ElementsAre(
+ empty_column_index,
+ ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{"a"},
+ /*max_values=*/{"d"}, BoundaryOrder::Ascending,
+ /*null_counts=*/{0}},
+ ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{encode_int64(1)},
+ /*max_values=*/{encode_int64(2)}, BoundaryOrder::Ascending,
+ /*null_counts=*/{2}},
+ empty_column_index,
+ ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{"f"},
+ /*max_values=*/{"f"}, BoundaryOrder::Ascending,
+ /*null_counts=*/{1}},
+ ColumnIndexObject{/*null_pages=*/{false}, /*min_values=*/{encode_int64(3)},
+ /*max_values=*/{encode_int64(3)}, BoundaryOrder::Ascending,
+ /*null_counts=*/{1}}));
+}
+
TEST_F(ParquetPageIndexRoundTripTest, DropLargeStats) {
auto writer_properties = WriterProperties::Builder()
.enable_write_page_index()
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index c91567e70d..42ce7591cb 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -146,7 +146,8 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
auto data_encryptor =
file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
: nullptr;
- auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path)
+ auto ci_builder = page_index_builder_ && properties_->page_index_enabled(path) &&
+ properties_->statistics_enabled(path)
? page_index_builder_->GetColumnIndexBuilder(column_ordinal)
: nullptr;
auto oi_builder = page_index_builder_ && properties_->page_index_enabled(path)
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 6e968c1218..2c81abb9ee 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -281,13 +281,15 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
<< "\"StatsSet\": ";
if (column_chunk->is_stats_set()) {
stream << "\"True\", \"Stats\": {";
- std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << "\"NumNulls\": \"" << stats->null_count();
+ if (stats->HasNullCount()) {
+ stream << "\"NumNulls\": \"" << stats->null_count();
+ }
if (stats->HasDistinctCount()) {
stream << "\", "
<< "\"DistinctValues\": \"" << stats->distinct_count();
}
if (stats->HasMinMax()) {
+ std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << "\", "
<< "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
<< "\", "
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index d1012fccf0..f16f6c49d3 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -154,7 +154,7 @@ class PARQUET_EXPORT ColumnProperties {
statistics_enabled_(statistics_enabled),
max_stats_size_(max_stats_size),
compression_level_(Codec::UseDefaultCompressionLevel()),
- page_index_enabled_(DEFAULT_IS_PAGE_INDEX_ENABLED) {}
+ page_index_enabled_(page_index_enabled) {}
void set_encoding(Encoding::type encoding) { encoding_ = encoding; }