You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2017/05/27 14:32:15 UTC

parquet-cpp git commit: PARQUET-978: [C++] Minimizing footer reads for small(ish) metadata

Repository: parquet-cpp
Updated Branches:
  refs/heads/master e93a085d7 -> b36c9ac00


PARQUET-978: [C++] Minimizing footer reads for small(ish) metadata

I realized we don't have a full end to end `writer-reader-test` for parquet. Would have been easier to test this PR. Does it make sense adding one?

Author: Deepak Majeti <de...@hpe.com>

Closes #341 from majetideepak/PARQUET-978 and squashes the following commits:

104df0c [Deepak Majeti] clang format
dbb28be [Deepak Majeti] check metadata size in reader-test
2b13715 [Deepak Majeti] Fix comment in writer-internal.cc
850b61c [Deepak Majeti] PARQUET-978


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/b36c9ac0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/b36c9ac0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/b36c9ac0

Branch: refs/heads/master
Commit: b36c9ac000ae64a27c51aecb48da9b270c83b2e8
Parents: e93a085
Author: Deepak Majeti <de...@hpe.com>
Authored: Sat May 27 16:32:10 2017 +0200
Committer: Uwe L. Korn <uw...@apache.org>
Committed: Sat May 27 16:32:10 2017 +0200

----------------------------------------------------------------------
 src/parquet/file/reader-internal.cc | 29 +++++++++++++++++++++--------
 src/parquet/file/writer-internal.cc |  1 -
 src/parquet/reader-test.cc          |  2 ++
 3 files changed, 23 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/file/reader-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc
index c05bb12..3543759 100644
--- a/src/parquet/file/reader-internal.cc
+++ b/src/parquet/file/reader-internal.cc
@@ -202,6 +202,8 @@ std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) {
 // ----------------------------------------------------------------------
 // SerializedFile: Parquet on-disk layout
 
+// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
+static constexpr int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024;
 static constexpr uint32_t FOOTER_SIZE = 8;
 static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'};
 
@@ -255,15 +257,19 @@ void SerializedFile::ParseMetaData() {
     throw ParquetException("Corrupted file, smaller than file footer");
   }
 
-  uint8_t footer_buffer[FOOTER_SIZE];
+  uint8_t footer_buffer[DEFAULT_FOOTER_READ_SIZE];
+  int64_t footer_read_size = std::min(file_size, DEFAULT_FOOTER_READ_SIZE);
   int64_t bytes_read =
-      source_->ReadAt(file_size - FOOTER_SIZE, FOOTER_SIZE, footer_buffer);
+      source_->ReadAt(file_size - footer_read_size, footer_read_size, footer_buffer);
 
-  if (bytes_read != FOOTER_SIZE || memcmp(footer_buffer + 4, PARQUET_MAGIC, 4) != 0) {
+  // Check if all bytes are read. Check if last 4 bytes read have the magic bits
+  if (bytes_read != footer_read_size ||
+      memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) != 0) {
     throw ParquetException("Invalid parquet file. Corrupt footer.");
   }
 
-  uint32_t metadata_len = *reinterpret_cast<uint32_t*>(footer_buffer);
+  uint32_t metadata_len =
+      *reinterpret_cast<uint32_t*>(footer_buffer + footer_read_size - FOOTER_SIZE);
   int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len;
   if (FOOTER_SIZE + metadata_len > file_size) {
     throw ParquetException(
@@ -273,10 +279,17 @@ void SerializedFile::ParseMetaData() {
 
   std::shared_ptr<PoolBuffer> metadata_buffer =
       AllocateBuffer(properties_.memory_pool(), metadata_len);
-  bytes_read =
-      source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data());
-  if (bytes_read != metadata_len) {
-    throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+
+  // Check if the footer_buffer contains the entire metadata
+  if (footer_read_size >= (metadata_len + FOOTER_SIZE)) {
+    memcpy(metadata_buffer->mutable_data(),
+        footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE), metadata_len);
+  } else {
+    bytes_read =
+        source_->ReadAt(metadata_start, metadata_len, metadata_buffer->mutable_data());
+    if (bytes_read != metadata_len) {
+      throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
+    }
   }
 
   file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &metadata_len);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/file/writer-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/writer-internal.cc b/src/parquet/file/writer-internal.cc
index b69e87e..633498c 100644
--- a/src/parquet/file/writer-internal.cc
+++ b/src/parquet/file/writer-internal.cc
@@ -62,7 +62,6 @@ static format::Statistics ToThrift(const EncodedStatistics& row_group_statistics
 
 void SerializedPageWriter::Close(bool has_dictionary, bool fallback) {
   // index_page_offset = 0 since they are not supported
-  // TODO: Remove default fallback = 'false' when implemented
   metadata_->Finish(num_values_, dictionary_page_offset_, 0, data_page_offset_,
       total_compressed_size_, total_uncompressed_size_, has_dictionary, fallback);
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/b36c9ac0/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index 71f982b..e8a6813 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -83,6 +83,8 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
   ASSERT_EQ(8, reader_->metadata()->num_rows());
   // This file only has 1 row group
   ASSERT_EQ(1, reader_->metadata()->num_row_groups());
+  // Size of the metadata is 730 bytes
+  ASSERT_EQ(730, reader_->metadata()->size());
   // This row group must have 8 rows
   ASSERT_EQ(8, group->metadata()->num_rows());