You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/05/31 16:09:35 UTC
[arrow] branch main updated: GH-35757: [C++][Parquet] using page-encoding-stats to build encodings (#35758)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new fee8bf9ccc GH-35757: [C++][Parquet] using page-encoding-stats to build encodings (#35758)
fee8bf9ccc is described below
commit fee8bf9cccff3bb2513631ce5782f241922d2f21
Author: mwish <ma...@gmail.com>
AuthorDate: Thu Jun 1 00:09:26 2023 +0800
GH-35757: [C++][Parquet] using page-encoding-stats to build encodings (#35758)
### Rationale for this change
This patch uses `encoding_stats` to build the encodings. Currently, `ColumnChunk.encodings` is repeated (it may contain multiple `PLAIN` entries) and is built with some weird logic based on `WriterProperties`. Also, if we later want to support falling back to a non-"PLAIN" encoding, `encoding_stats` would be correct while `encodings` would still claim "PLAIN". So this patch builds `encodings` from `encoding_stats`, forcing the two to be consistent.
### What changes are included in this PR?
### Are these changes tested?
Covered by existing tests (updated to match the new behavior).
### Are there any user-facing changes?
Some behavior would be changed, but I think my change is right :)
* Closes: #35757
Lead-authored-by: mwish <ma...@gmail.com>
Co-authored-by: Antoine Pitrou <pi...@free.fr>
Co-authored-by: Gang Wu <us...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/column_writer_test.cc | 17 ++++-----
cpp/src/parquet/metadata.cc | 52 ++++++++++++++-------------
cpp/src/parquet/metadata_test.cc | 37 +++++++++++++++----
python/pyarrow/tests/parquet/test_metadata.py | 2 +-
4 files changed, 69 insertions(+), 39 deletions(-)
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index 8b133967e8..ed9bb4b407 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -184,24 +184,25 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
ASSERT_EQ(VERY_LARGE_SIZE, this->values_read_);
this->values_.resize(VERY_LARGE_SIZE);
ASSERT_EQ(this->values_, this->values_out_);
- std::vector<Encoding::type> encodings = this->metadata_encodings();
+ std::vector<Encoding::type> encodings_vector = this->metadata_encodings();
+ std::set<Encoding::type> encodings(encodings_vector.begin(), encodings_vector.end());
if (this->type_num() == Type::BOOLEAN) {
// Dictionary encoding is not allowed for boolean type
// There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case
- std::vector<Encoding::type> expected({Encoding::PLAIN, Encoding::RLE});
+ std::set<Encoding::type> expected({Encoding::PLAIN, Encoding::RLE});
ASSERT_EQ(encodings, expected);
} else if (version == ParquetVersion::PARQUET_1_0) {
- // There are 4 encodings (PLAIN_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case
+ // There are 3 encodings (PLAIN_DICTIONARY, PLAIN, RLE) in a fallback case
// for version 1.0
- std::vector<Encoding::type> expected(
- {Encoding::PLAIN_DICTIONARY, Encoding::PLAIN, Encoding::RLE, Encoding::PLAIN});
+ std::set<Encoding::type> expected(
+ {Encoding::PLAIN_DICTIONARY, Encoding::PLAIN, Encoding::RLE});
ASSERT_EQ(encodings, expected);
} else {
- // There are 4 encodings (RLE_DICTIONARY, PLAIN, RLE, PLAIN) in a fallback case for
+ // There are 3 encodings (RLE_DICTIONARY, PLAIN, RLE) in a fallback case for
// version 2.0
- std::vector<Encoding::type> expected(
- {Encoding::RLE_DICTIONARY, Encoding::PLAIN, Encoding::RLE, Encoding::PLAIN});
+ std::set<Encoding::type> expected(
+ {Encoding::RLE_DICTIONARY, Encoding::PLAIN, Encoding::RLE});
ASSERT_EQ(encodings, expected);
}
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 0bbd965807..541bcc18b8 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -1462,40 +1462,44 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
std::vector<format::Encoding::type> thrift_encodings;
- if (has_dictionary) {
- thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
- if (properties_->version() == ParquetVersion::PARQUET_1_0) {
- thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
- } else {
- thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding()));
- }
- } else { // Dictionary not enabled
- thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path())));
- }
- thrift_encodings.push_back(ToThrift(Encoding::RLE));
- // Only PLAIN encoding is supported for fallback in V1
- // TODO(majetideepak): Use user specified encoding for V2
- if (dictionary_fallback) {
- thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
- }
- column_chunk_->meta_data.__set_encodings(thrift_encodings);
std::vector<format::PageEncodingStats> thrift_encoding_stats;
+ auto add_encoding = [&thrift_encodings](format::Encoding::type value) {
+ auto it = std::find(thrift_encodings.begin(), thrift_encodings.end(), value);
+ if (it == thrift_encodings.end()) {
+ thrift_encodings.push_back(value);
+ }
+ };
// Add dictionary page encoding stats
- for (const auto& entry : dict_encoding_stats) {
- format::PageEncodingStats dict_enc_stat;
- dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
- dict_enc_stat.__set_encoding(ToThrift(entry.first));
- dict_enc_stat.__set_count(entry.second);
- thrift_encoding_stats.push_back(dict_enc_stat);
+ if (has_dictionary) {
+ for (const auto& entry : dict_encoding_stats) {
+ format::PageEncodingStats dict_enc_stat;
+ dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
+ // Dictionary Encoding would be PLAIN_DICTIONARY in v1 and
+ // PLAIN in v2.
+ format::Encoding::type dict_encoding = ToThrift(entry.first);
+ dict_enc_stat.__set_encoding(dict_encoding);
+ dict_enc_stat.__set_count(entry.second);
+ thrift_encoding_stats.push_back(dict_enc_stat);
+ add_encoding(dict_encoding);
+ }
}
+ // Always add encoding for RL/DL.
+ // BIT_PACKED is supported in `LevelEncoder`, but would only be used
+ // in benchmark and testing.
+ // And for now, we always add RLE even if there are no levels at all,
+ // while parquet-mr is more fine-grained.
+ add_encoding(format::Encoding::RLE);
// Add data page encoding stats
for (const auto& entry : data_encoding_stats) {
format::PageEncodingStats data_enc_stat;
data_enc_stat.__set_page_type(format::PageType::DATA_PAGE);
- data_enc_stat.__set_encoding(ToThrift(entry.first));
+ format::Encoding::type data_encoding = ToThrift(entry.first);
+ data_enc_stat.__set_encoding(data_encoding);
data_enc_stat.__set_count(entry.second);
thrift_encoding_stats.push_back(data_enc_stat);
+ add_encoding(data_encoding);
}
+ column_chunk_->meta_data.__set_encodings(thrift_encodings);
column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
const auto& encrypt_md =
diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc
index fc4dfb3ac8..13f73b374e 100644
--- a/cpp/src/parquet/metadata_test.cc
+++ b/cpp/src/parquet/metadata_test.cc
@@ -66,7 +66,6 @@ std::unique_ptr<parquet::FileMetaData> GenerateTableMetaData(
// column metadata
col1_builder->SetStatistics(stats_int);
col2_builder->SetStatistics(stats_float);
- dict_encoding_stats.clear();
col1_builder->Finish(nrows / 2, /*dictionary_page_offset=*/0, 0, 10, 512, 600,
/*has_dictionary=*/false, false, dict_encoding_stats,
data_encoding_stats);
@@ -80,6 +79,13 @@ std::unique_ptr<parquet::FileMetaData> GenerateTableMetaData(
return f_builder->Finish();
}
+void AssertEncodings(const ColumnChunkMetaData& data,
+ const std::set<parquet::Encoding::type>& expected) {
+ std::set<parquet::Encoding::type> encodings(data.encodings().begin(),
+ data.encodings().end());
+ ASSERT_EQ(encodings, expected);
+}
+
TEST(Metadata, TestBuildAccess) {
parquet::schema::NodeVector fields;
parquet::schema::NodePtr root;
@@ -160,8 +166,18 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression());
ASSERT_EQ(nrows / 2, rg1_column1->num_values());
ASSERT_EQ(nrows / 2, rg1_column2->num_values());
- ASSERT_EQ(3, rg1_column1->encodings().size());
- ASSERT_EQ(3, rg1_column2->encodings().size());
+ {
+ std::set<parquet::Encoding::type> encodings{parquet::Encoding::RLE,
+ parquet::Encoding::RLE_DICTIONARY,
+ parquet::Encoding::PLAIN};
+ AssertEncodings(*rg1_column1, encodings);
+ }
+ {
+ std::set<parquet::Encoding::type> encodings{parquet::Encoding::RLE,
+ parquet::Encoding::RLE_DICTIONARY,
+ parquet::Encoding::PLAIN};
+ AssertEncodings(*rg1_column2, encodings);
+ }
ASSERT_EQ(512, rg1_column1->total_compressed_size());
ASSERT_EQ(512, rg1_column2->total_compressed_size());
ASSERT_EQ(600, rg1_column1->total_uncompressed_size());
@@ -197,8 +213,17 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(nrows / 2, rg2_column2->num_values());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression());
- ASSERT_EQ(2, rg2_column1->encodings().size());
- ASSERT_EQ(3, rg2_column2->encodings().size());
+ {
+ std::set<parquet::Encoding::type> encodings{parquet::Encoding::RLE,
+ parquet::Encoding::PLAIN};
+ AssertEncodings(*rg2_column1, encodings);
+ }
+ {
+ std::set<parquet::Encoding::type> encodings{parquet::Encoding::RLE,
+ parquet::Encoding::RLE_DICTIONARY,
+ parquet::Encoding::PLAIN};
+ AssertEncodings(*rg2_column2, encodings);
+ }
ASSERT_EQ(512, rg2_column1->total_compressed_size());
ASSERT_EQ(512, rg2_column2->total_compressed_size());
ASSERT_EQ(600, rg2_column1->total_uncompressed_size());
@@ -209,7 +234,7 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(10, rg2_column1->data_page_offset());
ASSERT_EQ(26, rg2_column2->data_page_offset());
ASSERT_EQ(2, rg2_column1->encoding_stats().size());
- ASSERT_EQ(2, rg2_column2->encoding_stats().size());
+ ASSERT_EQ(3, rg2_column2->encoding_stats().size());
// Test FileMetaData::set_file_path
ASSERT_TRUE(rg2_column1->file_path().empty());
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index fef1cc564b..342fdb21ae 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -128,7 +128,7 @@ def test_parquet_metadata_api():
assert col_meta.is_stats_set is True
assert isinstance(col_meta.statistics, pq.Statistics)
assert col_meta.compression == 'SNAPPY'
- assert col_meta.encodings == ('PLAIN', 'RLE')
+ assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
assert col_meta.has_dictionary_page is False
assert col_meta.dictionary_page_offset is None
assert col_meta.data_page_offset > 0