You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2021/09/22 07:06:46 UTC
[arrow] branch master updated: PARQUET-2089: [C++] Align RowGroup
file_offset with specification
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new ec7aeb5 PARQUET-2089: [C++] Align RowGroup file_offset with specification
ec7aeb5 is described below
commit ec7aeb577330d8bfe6410dcc0a12c3111ecf6e9e
Author: Micah Kornfield <mi...@google.com>
AuthorDate: Wed Sep 22 09:03:58 2021 +0200
PARQUET-2089: [C++] Align RowGroup file_offset with specification
CC @sunchao
Closes #11149 from emkornfield/file_offsets
Authored-by: Micah Kornfield <mi...@google.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/metadata.cc | 10 +++++++++-
cpp/src/parquet/metadata_test.cc | 17 ++++++++++++-----
2 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 3ef02c4..0f99530 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -1567,7 +1567,15 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
throw ParquetException(ss.str());
}
if (i == 0) {
- file_offset = row_group_->columns[0].file_offset;
+ const format::ColumnMetaData& first_col = row_group_->columns[0].meta_data;
+ // As per spec, file_offset for the row group points to the first
+ // dictionary or data page of the column.
+ if (first_col.__isset.dictionary_page_offset &&
+ first_col.dictionary_page_offset > 0) {
+ file_offset = first_col.dictionary_page_offset;
+ } else {
+ file_offset = first_col.data_page_offset;
+ }
}
// sometimes column metadata is encrypted and not available to read,
// so we must get total_compressed_size from column builder
diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc
index d34898e..a89d3d9 100644
--- a/cpp/src/parquet/metadata_test.cc
+++ b/cpp/src/parquet/metadata_test.cc
@@ -63,7 +63,9 @@ std::unique_ptr<parquet::FileMetaData> GenerateTableMetaData(
// column metadata
col1_builder->SetStatistics(stats_int);
col2_builder->SetStatistics(stats_float);
- col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false, dict_encoding_stats,
+ dict_encoding_stats.clear();
+ col1_builder->Finish(nrows / 2, /*dictionary_page_offset=*/0, 0, 10, 512, 600,
+ /*has_dictionary=*/false, false, dict_encoding_stats,
data_encoding_stats);
col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false, dict_encoding_stats,
data_encoding_stats);
@@ -136,6 +138,8 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(nrows / 2, rg1_accessor->num_rows());
ASSERT_EQ(1024, rg1_accessor->total_byte_size());
ASSERT_EQ(1024, rg1_accessor->total_compressed_size());
+ EXPECT_EQ(rg1_accessor->file_offset(),
+ rg1_accessor->ColumnChunk(0)->dictionary_page_offset());
auto rg1_column1 = rg1_accessor->ColumnChunk(0);
auto rg1_column2 = rg1_accessor->ColumnChunk(1);
@@ -171,6 +175,8 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(nrows / 2, rg2_accessor->num_rows());
ASSERT_EQ(1024, rg2_accessor->total_byte_size());
ASSERT_EQ(1024, rg2_accessor->total_compressed_size());
+ EXPECT_EQ(rg2_accessor->file_offset(),
+ rg2_accessor->ColumnChunk(0)->data_page_offset());
auto rg2_column1 = rg2_accessor->ColumnChunk(0);
auto rg2_column2 = rg2_accessor->ColumnChunk(1);
@@ -188,18 +194,19 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(nrows / 2, rg2_column2->num_values());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression());
- ASSERT_EQ(3, rg2_column1->encodings().size());
+ ASSERT_EQ(2, rg2_column1->encodings().size());
ASSERT_EQ(3, rg2_column2->encodings().size());
ASSERT_EQ(512, rg2_column1->total_compressed_size());
ASSERT_EQ(512, rg2_column2->total_compressed_size());
ASSERT_EQ(600, rg2_column1->total_uncompressed_size());
ASSERT_EQ(600, rg2_column2->total_uncompressed_size());
- ASSERT_EQ(6, rg2_column1->dictionary_page_offset());
+ EXPECT_FALSE(rg2_column1->has_dictionary_page());
+ ASSERT_EQ(0, rg2_column1->dictionary_page_offset());
ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
ASSERT_EQ(10, rg2_column1->data_page_offset());
ASSERT_EQ(26, rg2_column2->data_page_offset());
- ASSERT_EQ(3, rg2_column1->encoding_stats().size());
- ASSERT_EQ(3, rg2_column2->encoding_stats().size());
+ ASSERT_EQ(2, rg2_column1->encoding_stats().size());
+ ASSERT_EQ(2, rg2_column2->encoding_stats().size());
// Test FileMetaData::set_file_path
ASSERT_TRUE(rg2_column1->file_path().empty());