You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/12/01 10:58:02 UTC

[arrow] branch master updated: ARROW-18413: [C++][Parquet] Expose page index info from ColumnChunkMetaData (#14742)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 958fbfa5fe ARROW-18413: [C++][Parquet] Expose page index info from ColumnChunkMetaData (#14742)
958fbfa5fe is described below

commit 958fbfa5fe567a908f5cbcc09dfc54a00e480be9
Author: Gang Wu <us...@gmail.com>
AuthorDate: Thu Dec 1 18:57:52 2022 +0800

    ARROW-18413: [C++][Parquet] Expose page index info from ColumnChunkMetaData (#14742)
    
    This is the first step to support page index of parquet.
    
    Lead-authored-by: Gang Wu <us...@gmail.com>
    Co-authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/parquet/metadata.cc      | 22 ++++++++++++++++++++++
 cpp/src/parquet/metadata.h       | 11 +++++++++++
 cpp/src/parquet/metadata_test.cc | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+)

diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index c97e194955..1e1f96d906 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -312,6 +312,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
     }
   }
 
+  std::optional<IndexLocation> GetColumIndexLocation() const {  // offset + length, if set
+    if (column_->__isset.column_index_offset && column_->__isset.column_index_length) {
+      return IndexLocation{column_->column_index_offset, column_->column_index_length};
+    }
+    return std::nullopt;  // column_index_offset and/or column_index_length is not set
+  }
+
+  std::optional<IndexLocation> GetOffsetIndexLocation() const {  // offset + length, if set
+    if (column_->__isset.offset_index_offset && column_->__isset.offset_index_length) {
+      return IndexLocation{column_->offset_index_offset, column_->offset_index_length};
+    }
+    return std::nullopt;  // offset_index_offset and/or offset_index_length is not set
+  }
+
  private:
   mutable std::shared_ptr<Statistics> possible_stats_;
   std::vector<Encoding::type> encodings_;
@@ -420,6 +434,14 @@ std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() con
   return impl_->crypto_metadata();
 }
 
+std::optional<IndexLocation> ColumnChunkMetaData::GetColumIndexLocation() const {
+  return impl_->GetColumIndexLocation();  // delegates to the impl_ object
+}
+
+std::optional<IndexLocation> ColumnChunkMetaData::GetOffsetIndexLocation() const {
+  return impl_->GetOffsetIndexLocation();  // delegates to the impl_ object
+}
+
 bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
   return impl_->Equals(*other.impl_);
 }
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index bd59c628dc..8c619c5c63 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <map>
 #include <memory>
+#include <optional>
 #include <string>
 #include <utility>
 #include <vector>
@@ -118,6 +119,14 @@ struct PageEncodingStats {
   int32_t count;
 };
 
+/// \brief Byte range of a page index, as returned by ColumnChunkMetaData accessors.
+struct IndexLocation {
+  /// File offset of the given index, in bytes
+  int64_t offset;
+  /// Length of the given index, in bytes
+  int32_t length;
+};
+
 /// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
 class PARQUET_EXPORT ColumnChunkMetaData {
  public:
@@ -170,6 +179,8 @@ class PARQUET_EXPORT ColumnChunkMetaData {
   int64_t total_compressed_size() const;
   int64_t total_uncompressed_size() const;
   std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
+  std::optional<IndexLocation> GetColumIndexLocation() const;
+  std::optional<IndexLocation> GetOffsetIndexLocation() const;
 
  private:
   explicit ColumnChunkMetaData(
diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc
index a89d3d97fa..cabfb8078c 100644
--- a/cpp/src/parquet/metadata_test.cc
+++ b/cpp/src/parquet/metadata_test.cc
@@ -20,8 +20,10 @@
 #include <gtest/gtest.h>
 
 #include "arrow/util/key_value_metadata.h"
+#include "parquet/file_reader.h"
 #include "parquet/schema.h"
 #include "parquet/statistics.h"
+#include "parquet/test_util.h"
 #include "parquet/thrift_internal.h"
 #include "parquet/types.h"
 
@@ -292,6 +294,44 @@ TEST(Metadata, TestKeyValueMetadata) {
   EXPECT_TRUE(f_accessor->key_value_metadata()->Equals(*kvmeta));
 }
 
+TEST(Metadata, TestReadPageIndex) {  // checks index locations read from a sample file
+  std::string dir_string(parquet::test::get_data_dir());
+  std::string path = dir_string + "/alltypes_tiny_pages.parquet";
+  auto reader = ParquetFileReader::OpenFile(path, false);
+  auto file_metadata = reader->metadata();
+  ASSERT_EQ(1, file_metadata->num_row_groups());
+  auto row_group_metadata = file_metadata->RowGroup(0);
+  ASSERT_EQ(13, row_group_metadata->num_columns());
+  std::vector<int64_t> ci_offsets = {323583, 327502, 328009, 331928, 335847,
+                                     339766, 350345, 354264, 364843, 384342,
+                                     -1,     386473, 390392};  // -1: unused slot (col 10)
+  std::vector<int32_t> ci_lengths = {3919,  507,   3919, 3919, 3919, 10579, 3919,
+                                     10579, 19499, 2131, -1,   3919, 3919};  // -1: unused
+  std::vector<int64_t> oi_offsets = {394311, 397814, 398637, 401888, 405139,
+                                     408390, 413670, 416921, 422201, 431936,
+                                     435457, 446002, 449253};  // expected, one per column
+  std::vector<int32_t> oi_lengths = {3503, 823,  3251, 3251,  3251, 5280, 3251,
+                                     5280, 9735, 3521, 10545, 3251, 3251};
+  for (int i = 0; i < row_group_metadata->num_columns(); ++i) {
+    auto col_chunk_metadata = row_group_metadata->ColumnChunk(i);
+    auto ci_location = col_chunk_metadata->GetColumIndexLocation();
+    if (i == 10) {
+      // column_id 10 does not have column index
+      ASSERT_FALSE(ci_location.has_value());
+    } else {
+      ASSERT_TRUE(ci_location.has_value());
+    }
+    if (ci_location.has_value()) {  // the -1 placeholders are never compared
+      ASSERT_EQ(ci_offsets.at(i), ci_location->offset);
+      ASSERT_EQ(ci_lengths.at(i), ci_location->length);
+    }
+    auto oi_location = col_chunk_metadata->GetOffsetIndexLocation();
+    ASSERT_TRUE(oi_location.has_value());  // expected for every column in this file
+    ASSERT_EQ(oi_offsets.at(i), oi_location->offset);
+    ASSERT_EQ(oi_lengths.at(i), oi_location->length);
+  }
+}
+
 TEST(ApplicationVersion, Basics) {
   ApplicationVersion version("parquet-mr version 1.7.9");
   ApplicationVersion version1("parquet-mr version 1.8.0");