You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2017/05/27 14:32:15 UTC
parquet-cpp git commit: PARQUET-978: [C++] Minimizing footer reads
for small(ish) metadata
Repository: parquet-cpp
Updated Branches:
refs/heads/master e93a085d7 -> b36c9ac00
PARQUET-978: [C++] Minimizing footer reads for small(ish) metadata
I realized we don't have a full end to end `writer-reader-test` for parquet. Would have been easier to test this PR. Does it make sense adding one?
Author: Deepak Majeti <de...@hpe.com>
Closes #341 from majetideepak/PARQUET-978 and squashes the following commits:
104df0c [Deepak Majeti] clang format
dbb28be [Deepak Majeti] check metadata size in reader-test
2b13715 [Deepak Majeti] Fix comment in writer-internal.cc
850b61c [Deepak Majeti] PARQUET-978
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/b36c9ac0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/b36c9ac0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/b36c9ac0
Branch: refs/heads/master
Commit: b36c9ac000ae64a27c51aecb48da9b270c83b2e8
Parents: e93a085
Author: Deepak Majeti <de...@hpe.com>
Authored: Sat May 27 16:32:10 2017 +0200
Committer: Uwe L. Korn <uw...@apache.org>
Committed: Sat May 27 16:32:10 2017 +0200
----------------------------------------------------------------------
src/parquet/file/reader-internal.cc | 29 +++++++++++++++++++++--------
src/parquet/file/writer-internal.cc | 1 -
src/parquet/reader-test.cc | 2 ++
3 files changed, 23 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/file/reader-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc
index c05bb12..3543759 100644
--- a/src/parquet/file/reader-internal.cc
+++ b/src/parquet/file/reader-internal.cc
@@ -202,6 +202,8 @@ std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) {
// ----------------------------------------------------------------------
// SerializedFile: Parquet on-disk layout
+// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
+static constexpr int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024;
static constexpr uint32_t FOOTER_SIZE = 8;
static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'};
@@ -255,15 +257,19 @@ void SerializedFile::ParseMetaData() {
throw ParquetException("Corrupted file, smaller than file footer");
}
- uint8_t footer_buffer[FOOTER_SIZE];
+ uint8_t footer_buffer[DEFAULT_FOOTER_READ_SIZE];
+ int64_t footer_read_size = std::min(file_size, DEFAULT_FOOTER_READ_SIZE);
int64_t bytes_read =
- source_->ReadAt(file_size - FOOTER_SIZE, FOOTER_SIZE, footer_buffer);
+ source_->ReadAt(file_size - footer_read_size, footer_read_size, footer_buffer);
- if (bytes_read != FOOTER_SIZE || memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) {
+ // Check if all bytes are read. Check if last 4 bytes read have the magic bits
+ if (bytes_read != footer_read_size ||
+ memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) != 0) {
throw ParquetException("Invalid parquet file. Corrupt footer.");
}
- uint32_t metadata_len = *reinterpret_cast<uint32_t*>(footer_buffer);
+ uint32_t metadata_len =
+ *reinterpret_cast<uint32_t*>(footer_buffer + footer_read_size - FOOTER_SIZE);
int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len;
if (FOOTER_SIZE + metadata_len > file_size) {
throw ParquetException(
@@ -273,10 +279,17 @@ void SerializedFile::ParseMetaData() {
std::shared_ptr<PoolBuffer> metadata_buffer =
AllocateBuffer(properties_.memory_pool(), metadata_len);
- bytes_read =
- source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data());
- if (bytes_read != metadata_len) {
- throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+
+ // Check if the footer_buffer contains the entire metadata
+ if (footer_read_size >= (metadata_len + FOOTER_SIZE)) {
+ memcpy(metadata_buffer->mutable_data(),
+ footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), metadata_len);
+ } else {
+ bytes_read =
+ source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data());
+ if (bytes_read != metadata_len) {
+ throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+ }
}
file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/file/writer-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/writer-internal.cc b/src/parquet/file/writer-internal.cc
index b69e87e..633498c 100644
--- a/src/parquet/file/writer-internal.cc
+++ b/src/parquet/file/writer-internal.cc
@@ -62,7 +62,6 @@ static format::Statistics ToThrift(const EncodedStatistics& row_group_statistics
void SerializedPageWriter::Close(bool has_dictionary, bool fallback) {
// index_page_offset = 0 since they are not supported
- // TODO: Remove default fallback = 'false' when implemented
metadata_->Finish(num_values_, dictionary_page_offset_, 0, data_page_offset_,
total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index 71f982b..e8a6813 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -83,6 +83,8 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
ASSERT_EQ(8, reader_->metadata()->num_rows());
// This file only has 1 row group
ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+ // Size of the metadata is 730 bytes
+ ASSERT_EQ(730, reader_->metadata()->size());
// This row group must have 8 rows
ASSERT_EQ(8, group->metadata()->num_rows());