You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2021/09/22 07:06:46 UTC

[arrow] branch master updated: PARQUET-2089: [C++] Align RowGroup file_offset with specification

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new ec7aeb5  PARQUET-2089: [C++] Align RowGroup file_offset with specification
ec7aeb5 is described below

commit ec7aeb577330d8bfe6410dcc0a12c3111ecf6e9e
Author: Micah Kornfield <mi...@google.com>
AuthorDate: Wed Sep 22 09:03:58 2021 +0200

    PARQUET-2089: [C++] Align RowGroup file_offset with specification
    
    CC @sunchao
    
    Closes #11149 from emkornfield/file_offsets
    
    Authored-by: Micah Kornfield <mi...@google.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/parquet/metadata.cc      | 10 +++++++++-
 cpp/src/parquet/metadata_test.cc | 17 ++++++++++++-----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 3ef02c4..0f99530 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -1567,7 +1567,15 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
         throw ParquetException(ss.str());
       }
       if (i == 0) {
-        file_offset = row_group_->columns[0].file_offset;
+        const format::ColumnMetaData& first_col = row_group_->columns[0].meta_data;
+        // As per spec, file_offset for the row group points to the first
+        // dictionary or data page of the column.
+        if (first_col.__isset.dictionary_page_offset &&
+            first_col.dictionary_page_offset > 0) {
+          file_offset = first_col.dictionary_page_offset;
+        } else {
+          file_offset = first_col.data_page_offset;
+        }
       }
       // sometimes column metadata is encrypted and not available to read,
       // so we must get total_compressed_size from column builder
diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc
index d34898e..a89d3d9 100644
--- a/cpp/src/parquet/metadata_test.cc
+++ b/cpp/src/parquet/metadata_test.cc
@@ -63,7 +63,9 @@ std::unique_ptr<parquet::FileMetaData> GenerateTableMetaData(
   // column metadata
   col1_builder->SetStatistics(stats_int);
   col2_builder->SetStatistics(stats_float);
-  col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false, dict_encoding_stats,
+  dict_encoding_stats.clear();
+  col1_builder->Finish(nrows / 2, /*dictionary_page_offset=*/0, 0, 10, 512, 600,
+                       /*has_dictionary=*/false, false, dict_encoding_stats,
                        data_encoding_stats);
   col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false, dict_encoding_stats,
                        data_encoding_stats);
@@ -136,6 +138,8 @@ TEST(Metadata, TestBuildAccess) {
     ASSERT_EQ(nrows / 2, rg1_accessor->num_rows());
     ASSERT_EQ(1024, rg1_accessor->total_byte_size());
     ASSERT_EQ(1024, rg1_accessor->total_compressed_size());
+    EXPECT_EQ(rg1_accessor->file_offset(),
+              rg1_accessor->ColumnChunk(0)->dictionary_page_offset());
 
     auto rg1_column1 = rg1_accessor->ColumnChunk(0);
     auto rg1_column2 = rg1_accessor->ColumnChunk(1);
@@ -171,6 +175,8 @@ TEST(Metadata, TestBuildAccess) {
     ASSERT_EQ(nrows / 2, rg2_accessor->num_rows());
     ASSERT_EQ(1024, rg2_accessor->total_byte_size());
     ASSERT_EQ(1024, rg2_accessor->total_compressed_size());
+    EXPECT_EQ(rg2_accessor->file_offset(),
+              rg2_accessor->ColumnChunk(0)->data_page_offset());
 
     auto rg2_column1 = rg2_accessor->ColumnChunk(0);
     auto rg2_column2 = rg2_accessor->ColumnChunk(1);
@@ -188,18 +194,19 @@ TEST(Metadata, TestBuildAccess) {
     ASSERT_EQ(nrows / 2, rg2_column2->num_values());
     ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
     ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression());
-    ASSERT_EQ(3, rg2_column1->encodings().size());
+    ASSERT_EQ(2, rg2_column1->encodings().size());
     ASSERT_EQ(3, rg2_column2->encodings().size());
     ASSERT_EQ(512, rg2_column1->total_compressed_size());
     ASSERT_EQ(512, rg2_column2->total_compressed_size());
     ASSERT_EQ(600, rg2_column1->total_uncompressed_size());
     ASSERT_EQ(600, rg2_column2->total_uncompressed_size());
-    ASSERT_EQ(6, rg2_column1->dictionary_page_offset());
+    EXPECT_FALSE(rg2_column1->has_dictionary_page());
+    ASSERT_EQ(0, rg2_column1->dictionary_page_offset());
     ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
     ASSERT_EQ(10, rg2_column1->data_page_offset());
     ASSERT_EQ(26, rg2_column2->data_page_offset());
-    ASSERT_EQ(3, rg2_column1->encoding_stats().size());
-    ASSERT_EQ(3, rg2_column2->encoding_stats().size());
+    ASSERT_EQ(2, rg2_column1->encoding_stats().size());
+    ASSERT_EQ(2, rg2_column2->encoding_stats().size());
 
     // Test FileMetaData::set_file_path
     ASSERT_TRUE(rg2_column1->file_path().empty());