You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/12/01 10:58:02 UTC
[arrow] branch master updated: ARROW-18413: [C++][Parquet] Expose page index info from ColumnChunkMetaData (#14742)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 958fbfa5fe ARROW-18413: [C++][Parquet] Expose page index info from ColumnChunkMetaData (#14742)
958fbfa5fe is described below
commit 958fbfa5fe567a908f5cbcc09dfc54a00e480be9
Author: Gang Wu <us...@gmail.com>
AuthorDate: Thu Dec 1 18:57:52 2022 +0800
ARROW-18413: [C++][Parquet] Expose page index info from ColumnChunkMetaData (#14742)
This is the first step toward supporting the Parquet page index.
Lead-authored-by: Gang Wu <us...@gmail.com>
Co-authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/metadata.cc | 22 ++++++++++++++++++++++
cpp/src/parquet/metadata.h | 11 +++++++++++
cpp/src/parquet/metadata_test.cc | 40 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 73 insertions(+)
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index c97e194955..1e1f96d906 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -312,6 +312,20 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
}
}
+ std::optional<IndexLocation> GetColumIndexLocation() const {
+ if (column_->__isset.column_index_offset && column_->__isset.column_index_length) {
+ return IndexLocation{column_->column_index_offset, column_->column_index_length};
+ }
+ return std::nullopt;
+ }
+
+ std::optional<IndexLocation> GetOffsetIndexLocation() const {
+ if (column_->__isset.offset_index_offset && column_->__isset.offset_index_length) {
+ return IndexLocation{column_->offset_index_offset, column_->offset_index_length};
+ }
+ return std::nullopt;
+ }
+
private:
mutable std::shared_ptr<Statistics> possible_stats_;
std::vector<Encoding::type> encodings_;
@@ -420,6 +434,14 @@ std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() con
return impl_->crypto_metadata();
}
+std::optional<IndexLocation> ColumnChunkMetaData::GetColumIndexLocation() const {
+ return impl_->GetColumIndexLocation();
+}
+
+std::optional<IndexLocation> ColumnChunkMetaData::GetOffsetIndexLocation() const {
+ return impl_->GetOffsetIndexLocation();
+}
+
bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
return impl_->Equals(*other.impl_);
}
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index bd59c628dc..8c619c5c63 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -20,6 +20,7 @@
#include <cstdint>
#include <map>
#include <memory>
+#include <optional>
#include <string>
#include <utility>
#include <vector>
@@ -118,6 +119,14 @@ struct PageEncodingStats {
int32_t count;
};
+/// \brief Location of a page index (column index or offset index) in the file.
+struct IndexLocation {
+ /// File offset at which the index starts, in bytes
+ int64_t offset;
+ /// Length of the index, in bytes
+ int32_t length;
+};
+
/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
class PARQUET_EXPORT ColumnChunkMetaData {
public:
@@ -170,6 +179,8 @@ class PARQUET_EXPORT ColumnChunkMetaData {
int64_t total_compressed_size() const;
int64_t total_uncompressed_size() const;
std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
+ std::optional<IndexLocation> GetColumIndexLocation() const;
+ std::optional<IndexLocation> GetOffsetIndexLocation() const;
private:
explicit ColumnChunkMetaData(
diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc
index a89d3d97fa..cabfb8078c 100644
--- a/cpp/src/parquet/metadata_test.cc
+++ b/cpp/src/parquet/metadata_test.cc
@@ -20,8 +20,10 @@
#include <gtest/gtest.h>
#include "arrow/util/key_value_metadata.h"
+#include "parquet/file_reader.h"
#include "parquet/schema.h"
#include "parquet/statistics.h"
+#include "parquet/test_util.h"
#include "parquet/thrift_internal.h"
#include "parquet/types.h"
@@ -292,6 +294,44 @@ TEST(Metadata, TestKeyValueMetadata) {
EXPECT_TRUE(f_accessor->key_value_metadata()->Equals(*kvmeta));
}
+TEST(Metadata, TestReadPageIndex) {
+ std::string dir_string(parquet::test::get_data_dir());
+ std::string path = dir_string + "/alltypes_tiny_pages.parquet";
+ auto reader = ParquetFileReader::OpenFile(path, false);
+ auto file_metadata = reader->metadata();
+ ASSERT_EQ(1, file_metadata->num_row_groups());
+ auto row_group_metadata = file_metadata->RowGroup(0);
+ ASSERT_EQ(13, row_group_metadata->num_columns());
+ std::vector<int64_t> ci_offsets = {323583, 327502, 328009, 331928, 335847,
+ 339766, 350345, 354264, 364843, 384342,
+ -1, 386473, 390392};
+ std::vector<int32_t> ci_lengths = {3919, 507, 3919, 3919, 3919, 10579, 3919,
+ 10579, 19499, 2131, -1, 3919, 3919};
+ std::vector<int64_t> oi_offsets = {394311, 397814, 398637, 401888, 405139,
+ 408390, 413670, 416921, 422201, 431936,
+ 435457, 446002, 449253};
+ std::vector<int32_t> oi_lengths = {3503, 823, 3251, 3251, 3251, 5280, 3251,
+ 5280, 9735, 3521, 10545, 3251, 3251};
+ for (int i = 0; i < row_group_metadata->num_columns(); ++i) {
+ auto col_chunk_metadata = row_group_metadata->ColumnChunk(i);
+ auto ci_location = col_chunk_metadata->GetColumIndexLocation();
+ if (i == 10) {
+ // Column 10 is expected to have no column index, so no location is returned
+ ASSERT_FALSE(ci_location.has_value());
+ } else {
+ ASSERT_TRUE(ci_location.has_value());
+ }
+ if (ci_location.has_value()) {
+ ASSERT_EQ(ci_offsets.at(i), ci_location->offset);
+ ASSERT_EQ(ci_lengths.at(i), ci_location->length);
+ }
+ auto oi_location = col_chunk_metadata->GetOffsetIndexLocation();
+ ASSERT_TRUE(oi_location.has_value());
+ ASSERT_EQ(oi_offsets.at(i), oi_location->offset);
+ ASSERT_EQ(oi_lengths.at(i), oi_location->length);
+ }
+}
+
TEST(ApplicationVersion, Basics) {
ApplicationVersion version("parquet-mr version 1.7.9");
ApplicationVersion version1("parquet-mr version 1.8.0");