You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/09/01 12:42:49 UTC
parquet-cpp git commit: PARQUET-573: Create a public API for reading
and writing file metadata
Repository: parquet-cpp
Updated Branches:
refs/heads/master aabb3db2c -> c0fd08a97
PARQUET-573: Create a public API for reading and writing file metadata
This patch adds an API to read and write metadata, as well as improving the writer properties class.
I am planning to add some comments to the code next. Meanwhile, any feedback will be helpful.
Author: Deepak Majeti <de...@hpe.com>
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #143 from majetideepak/metadata and squashes the following commits:
2b8a546 [Deepak Majeti] comments and more testing for metadata api
59147c0 [Deepak Majeti] fix memory leak
34e8975 [Deepak Majeti] review comments and format
a977c6a [Deepak Majeti] added comment for file path
d4f0e82 [Deepak Majeti] friendship between reader and writer. implements PARQUET-692
1047507 [Uwe L. Korn] Better dictionary encoding user experience
7f37f85 [Deepak Majeti] review edits
9dab591 [Deepak Majeti] minor rename
a6b0646 [Deepak Majeti] added more dictionary fallback and enabled options to writer properties
3b9bad3 [Deepak Majeti] Metadata Reader writer
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c0fd08a9
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c0fd08a9
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c0fd08a9
Branch: refs/heads/master
Commit: c0fd08a97c6817180372c90693b0b356cbce1f11
Parents: aabb3db
Author: Deepak Majeti <de...@hpe.com>
Authored: Thu Sep 1 08:42:36 2016 -0400
Committer: Wes McKinney <we...@apache.org>
Committed: Thu Sep 1 08:42:36 2016 -0400
----------------------------------------------------------------------
CMakeLists.txt | 1 +
example/parquet-dump-schema.cc | 2 +-
src/parquet/CMakeLists.txt | 4 +-
src/parquet/api/reader.h | 3 +
src/parquet/column/properties.h | 173 +++++++--
src/parquet/column/writer.cc | 8 +-
src/parquet/file/CMakeLists.txt | 2 +
src/parquet/file/file-metadata-test.cc | 157 ++++++++
src/parquet/file/file-serialize-test.cc | 10 +-
src/parquet/file/metadata.cc | 549 +++++++++++++++++++++++++++
src/parquet/file/metadata.h | 203 ++++++++++
src/parquet/file/reader-internal.cc | 88 +----
src/parquet/file/reader-internal.h | 25 +-
src/parquet/file/reader.cc | 114 +++---
src/parquet/file/reader.h | 57 +--
src/parquet/reader-test.cc | 6 +-
src/parquet/util/bpacking.h | 2 +
17 files changed, 1141 insertions(+), 263 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f833f2c..5c26e79 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -498,6 +498,7 @@ set(LIBPARQUET_SRCS
src/parquet/compression/snappy-codec.cc
src/parquet/compression/gzip-codec.cc
+ src/parquet/file/metadata.cc
src/parquet/file/reader.cc
src/parquet/file/reader-internal.cc
src/parquet/file/writer.cc
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/example/parquet-dump-schema.cc
----------------------------------------------------------------------
diff --git a/example/parquet-dump-schema.cc b/example/parquet-dump-schema.cc
index 760359e..ed7b570 100644
--- a/example/parquet-dump-schema.cc
+++ b/example/parquet-dump-schema.cc
@@ -27,7 +27,7 @@ int main(int argc, char** argv) {
try {
std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename);
- PrintSchema(reader->descr()->schema().get(), std::cout);
+ PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
} catch (const std::exception& e) {
std::cerr << "Parquet error: "
<< e.what()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt
index 7d4e905..a2ebbad 100644
--- a/src/parquet/CMakeLists.txt
+++ b/src/parquet/CMakeLists.txt
@@ -21,8 +21,6 @@ install(FILES
types.h
DESTINATION include/parquet)
-ADD_PARQUET_TEST(public-api-test
- LINKAGE shared)
-
+ADD_PARQUET_TEST(public-api-test)
ADD_PARQUET_TEST(types-test)
ADD_PARQUET_TEST(reader-test)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/api/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/api/reader.h b/src/parquet/api/reader.h
index 572ecf5..1e0c5e3 100644
--- a/src/parquet/api/reader.h
+++ b/src/parquet/api/reader.h
@@ -23,6 +23,9 @@
#include "parquet/exception.h"
#include "parquet/file/reader.h"
+// Metadata reader API
+#include "parquet/file/metadata.h"
+
// Schemas
#include "parquet/api/schema.h"
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/column/properties.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/properties.h b/src/parquet/column/properties.h
index 3234dbc..c8f103b 100644
--- a/src/parquet/column/properties.h
+++ b/src/parquet/column/properties.h
@@ -22,6 +22,7 @@
#include <string>
#include <unordered_map>
+#include "parquet/exception.h"
#include "parquet/types.h"
#include "parquet/schema/types.h"
#include "parquet/util/input.h"
@@ -77,11 +78,13 @@ class PARQUET_EXPORT ReaderProperties {
ReaderProperties PARQUET_EXPORT default_reader_properties();
-static int64_t DEFAULT_PAGE_SIZE = 1024 * 1024;
-static int64_t DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE;
-static Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
+static constexpr int64_t DEFAULT_PAGE_SIZE = 1024 * 1024;
+static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE;
+static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
static constexpr ParquetVersion::type DEFAULT_WRITER_VERSION =
ParquetVersion::PARQUET_1_0;
+static const std::string DEFAULT_CREATED_BY = "Apache parquet-cpp";
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
using ColumnCodecs = std::unordered_map<std::string, Compression::type>;
@@ -92,10 +95,12 @@ class PARQUET_EXPORT WriterProperties {
public:
Builder()
: allocator_(default_allocator()),
+ dictionary_enabled_default_(DEFAULT_IS_DICTIONARY_ENABLED),
dictionary_pagesize_(DEFAULT_DICTIONARY_PAGE_SIZE),
- default_encoding_(DEFAULT_ENCODING),
pagesize_(DEFAULT_PAGE_SIZE),
version_(DEFAULT_WRITER_VERSION),
+ created_by_(DEFAULT_CREATED_BY),
+ default_encoding_(DEFAULT_ENCODING),
default_codec_(DEFAULT_COMPRESSION_TYPE) {}
virtual ~Builder() {}
@@ -104,6 +109,34 @@ class PARQUET_EXPORT WriterProperties {
return this;
}
+ Builder* enable_dictionary() {
+ dictionary_enabled_default_ = true;
+ return this;
+ }
+
+ Builder* disable_dictionary() {
+ dictionary_enabled_default_ = false;
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = true;
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->enable_dictionary(path->ToDotString());
+ }
+
+ Builder* disable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = false;
+ return this;
+ }
+
+ Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->disable_dictionary(path->ToDotString());
+ }
+
Builder* dictionary_pagesize(int64_t dictionary_psize) {
dictionary_pagesize_ = dictionary_psize;
return this;
@@ -114,26 +147,57 @@ class PARQUET_EXPORT WriterProperties {
return this;
}
- Builder* encoding(
- const std::shared_ptr<schema::ColumnPath>& path, Encoding::type encoding_type) {
- return encoding(path->ToDotString(), encoding_type);
+ Builder* version(ParquetVersion::type version) {
+ version_ = version;
+ return this;
}
- Builder* encoding(const std::string& column_path, Encoding::type encoding_type) {
- encodings_[column_path] = encoding_type;
+ Builder* created_by(const std::string& created_by) {
+ created_by_ = created_by;
return this;
}
+ /**
+ * Define the encoding that is used when we don't utilise dictionary encoding.
+ *
+ * This applies either if dictionary encoding is disabled or if we fall back
+ * because the dictionary grew too large.
+ */
Builder* encoding(Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
default_encoding_ = encoding_type;
return this;
}
- Builder* version(ParquetVersion::type version) {
- version_ = version;
+ /**
+ * Define the encoding that is used when we don't utilise dictionary encoding.
+ *
+ * This applies either if dictionary encoding is disabled or if we fall back
+ * because the dictionary grew too large.
+ */
+ Builder* encoding(const std::string& path, Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
+ encodings_[path] = encoding_type;
return this;
}
+ /**
+ * Define the encoding that is used when we don't utilise dictionary encoding.
+ *
+ * This applies either if dictionary encoding is disabled or if we fall back
+ * because the dictionary grew too large.
+ */
+ Builder* encoding(
+ const std::shared_ptr<schema::ColumnPath>& path, Encoding::type encoding_type) {
+ return this->encoding(path->ToDotString(), encoding_type);
+ }
+
Builder* compression(Compression::type codec) {
default_codec_ = codec;
return this;
@@ -151,76 +215,101 @@ class PARQUET_EXPORT WriterProperties {
std::shared_ptr<WriterProperties> build() {
return std::shared_ptr<WriterProperties>(
- new WriterProperties(allocator_, dictionary_pagesize_, default_encoding_,
- encodings_, pagesize_, version_, default_codec_, codecs_));
+ new WriterProperties(allocator_, dictionary_enabled_default_,
+ dictionary_enabled_, dictionary_pagesize_, pagesize_, version_, created_by_,
+ default_encoding_, encodings_, default_codec_, codecs_));
}
private:
MemoryAllocator* allocator_;
+ bool dictionary_enabled_default_;
+ std::unordered_map<std::string, bool> dictionary_enabled_;
int64_t dictionary_pagesize_;
+ int64_t pagesize_;
+ ParquetVersion::type version_;
+ std::string created_by_;
// Encoding used for each column if not a specialized one is defined as
// part of encodings_
Encoding::type default_encoding_;
std::unordered_map<std::string, Encoding::type> encodings_;
- int64_t pagesize_;
- ParquetVersion::type version_;
// Default compression codec. This will be used for all columns that do
// not have a specific codec set as part of codecs_
Compression::type default_codec_;
ColumnCodecs codecs_;
};
- MemoryAllocator* allocator() const { return allocator_; }
+ inline MemoryAllocator* allocator() const { return allocator_; }
- int64_t dictionary_pagesize() const { return dictionary_pagesize_; }
+ inline bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+ auto it = dictionary_enabled_.find(path->ToDotString());
+ if (it != dictionary_enabled_.end()) { return it->second; }
+ return dictionary_enabled_default_;
+ }
+
+ inline int64_t dictionary_pagesize() const { return dictionary_pagesize_; }
- int64_t data_pagesize() const { return pagesize_; }
+ inline int64_t data_pagesize() const { return pagesize_; }
- ParquetVersion::type version() const { return parquet_version_; }
+ inline ParquetVersion::type version() const { return parquet_version_; }
- Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
- Encoding::type coding = default_encoding_;
+ inline std::string created_by() const { return parquet_created_by_; }
+
+ inline Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
auto it = encodings_.find(path->ToDotString());
- if (it != encodings_.end()) { coding = it->second; }
-
- // Use the correct enum value for dictionary coding based on the used Parquet version
- if (coding == Encoding::PLAIN_DICTIONARY || coding == Encoding::RLE_DICTIONARY) {
- if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
- return Encoding::PLAIN_DICTIONARY;
- } else {
- return Encoding::RLE_DICTIONARY;
- }
+ if (it != encodings_.end()) { return it->second; }
+ return default_encoding_;
+ }
+
+ inline Encoding::type dictionary_index_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::RLE_DICTIONARY;
}
- return coding;
}
- Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
+ inline Encoding::type dictionary_page_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::PLAIN;
+ }
+ }
+
+ inline Compression::type compression(
+ const std::shared_ptr<schema::ColumnPath>& path) const {
auto it = codecs_.find(path->ToDotString());
if (it != codecs_.end()) return it->second;
return default_codec_;
}
private:
- explicit WriterProperties(MemoryAllocator* allocator, int64_t dictionary_pagesize,
- Encoding::type default_encoding,
- const std::unordered_map<std::string, Encoding::type>& encodings, int64_t pagesize,
- ParquetVersion::type version, Compression::type default_codec,
- const ColumnCodecs& codecs)
+ explicit WriterProperties(MemoryAllocator* allocator, bool dictionary_enabled_default,
+ std::unordered_map<std::string, bool> dictionary_enabled,
+ int64_t dictionary_pagesize, int64_t pagesize, ParquetVersion::type version,
+ const std::string& created_by, Encoding::type default_encoding,
+ std::unordered_map<std::string, Encoding::type> encodings,
+ Compression::type default_codec, const ColumnCodecs& codecs)
: allocator_(allocator),
+ dictionary_enabled_default_(dictionary_enabled_default),
+ dictionary_enabled_(dictionary_enabled),
dictionary_pagesize_(dictionary_pagesize),
- default_encoding_(default_encoding),
- encodings_(encodings),
pagesize_(pagesize),
parquet_version_(version),
+ parquet_created_by_(created_by),
+ default_encoding_(default_encoding),
+ encodings_(encodings),
default_codec_(default_codec),
codecs_(codecs) {}
-
MemoryAllocator* allocator_;
+ bool dictionary_enabled_default_;
+ std::unordered_map<std::string, bool> dictionary_enabled_;
int64_t dictionary_pagesize_;
- Encoding::type default_encoding_;
- std::unordered_map<std::string, Encoding::type> encodings_;
int64_t pagesize_;
ParquetVersion::type parquet_version_;
+ std::string parquet_created_by_;
+ Encoding::type default_encoding_;
+ std::unordered_map<std::string, Encoding::type> encodings_;
Compression::type default_codec_;
ColumnCodecs codecs_;
};
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/column/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc
index 7845c58..124486c 100644
--- a/src/parquet/column/writer.cc
+++ b/src/parquet/column/writer.cc
@@ -182,9 +182,8 @@ void TypedColumnWriter<Type>::WriteDictionaryPage() {
// TODO Get rid of this deep call
dict_encoder->mem_pool()->FreeAll();
- Encoding::type dict_encoding = Encoding::PLAIN_DICTIONARY;
- if (encoding_ == Encoding::RLE_DICTIONARY) { dict_encoding = Encoding::PLAIN; }
- DictionaryPage page(buffer, dict_encoder->num_entries(), dict_encoding);
+ DictionaryPage page(
+ buffer, dict_encoder->num_entries(), properties_->dictionary_page_encoding());
total_bytes_written_ += pager_->WriteDictionaryPage(page);
}
@@ -195,6 +194,9 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(const ColumnDescriptor* descr,
std::unique_ptr<PageWriter> pager, int64_t expected_rows,
const WriterProperties* properties) {
Encoding::type encoding = properties->encoding(descr->path());
+ if (properties->dictionary_enabled(descr->path())) {
+ encoding = properties->dictionary_index_encoding();
+ }
switch (descr->physical_type()) {
case Type::BOOLEAN:
return std::make_shared<BoolWriter>(
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/file/CMakeLists.txt b/src/parquet/file/CMakeLists.txt
index acfb513..fa995b8 100644
--- a/src/parquet/file/CMakeLists.txt
+++ b/src/parquet/file/CMakeLists.txt
@@ -16,9 +16,11 @@
# under the License.
install(FILES
+ metadata.h
reader.h
writer.h
DESTINATION include/parquet/file)
ADD_PARQUET_TEST(file-deserialize-test)
+ADD_PARQUET_TEST(file-metadata-test)
ADD_PARQUET_TEST(file-serialize-test)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/file-metadata-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/file-metadata-test.cc b/src/parquet/file/file-metadata-test.cc
new file mode 100644
index 0000000..5fbd613
--- /dev/null
+++ b/src/parquet/file/file-metadata-test.cc
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include "parquet/file/metadata.h"
+#include "parquet/schema/descriptor.h"
+#include "parquet/schema/types.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace metadata {
+
+TEST(Metadata, TestBuildAccess) {
+ parquet::schema::NodeVector fields;
+ parquet::schema::NodePtr root;
+ parquet::SchemaDescriptor schema;
+
+ std::shared_ptr<WriterProperties> props = WriterProperties::Builder().build();
+
+ fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED));
+ fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED));
+ root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);
+ schema.Init(root);
+
+ int64_t nrows = 1000;
+ ColumnStatistics stats_int;
+ stats_int.null_count = 0;
+ stats_int.distinct_count = nrows;
+ std::string int_min = std::string("100");
+ std::string int_max = std::string("200");
+ stats_int.min = &int_min;
+ stats_int.max = &int_max;
+ ColumnStatistics stats_float;
+ stats_float.null_count = 0;
+ stats_float.distinct_count = nrows;
+ std::string float_min = std::string("100.100");
+ std::string float_max = std::string("200.200");
+ stats_float.min = &float_min;
+ stats_float.max = &float_max;
+
+ auto f_builder = FileMetaDataBuilder::Make(&schema, props);
+ auto rg1_builder = f_builder->AppendRowGroup();
+ auto rg2_builder = f_builder->AppendRowGroup();
+
+ // Write the metadata
+ // rowgroup1 metadata
+ auto col1_builder = rg1_builder->NextColumnChunk();
+ auto col2_builder = rg1_builder->NextColumnChunk();
+ // column metadata
+ col1_builder->SetStatistics(stats_int);
+ col2_builder->SetStatistics(stats_float);
+ col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, false);
+ col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, false);
+ rg1_builder->Finish(nrows / 2);
+
+ // rowgroup2 metadata
+ col1_builder = rg2_builder->NextColumnChunk();
+ col2_builder = rg2_builder->NextColumnChunk();
+ // column metadata
+ col1_builder->SetStatistics(stats_int);
+ col2_builder->SetStatistics(stats_float);
+ col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, false);
+ col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, false);
+ rg2_builder->Finish(nrows / 2);
+
+ // Read the metadata
+ auto f_accessor = f_builder->Finish();
+
+ // file metadata
+ ASSERT_EQ(nrows, f_accessor->num_rows());
+ ASSERT_EQ(2, f_accessor->num_row_groups());
+ ASSERT_EQ(DEFAULT_WRITER_VERSION, f_accessor->version());
+ ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by());
+ ASSERT_EQ(3, f_accessor->num_schema_elements());
+
+ // row group1 metadata
+ auto rg1_accessor = f_accessor->RowGroup(0);
+ ASSERT_EQ(2, rg1_accessor->num_columns());
+ ASSERT_EQ(nrows / 2, rg1_accessor->num_rows());
+ ASSERT_EQ(1024, rg1_accessor->total_byte_size());
+
+ auto rg1_column1 = rg1_accessor->ColumnChunk(0);
+ auto rg1_column2 = rg1_accessor->ColumnChunk(1);
+ ASSERT_EQ(true, rg1_column1->is_stats_set());
+ ASSERT_EQ(true, rg1_column2->is_stats_set());
+ ASSERT_EQ("100.100", *rg1_column2->statistics().min);
+ ASSERT_EQ("200.200", *rg1_column2->statistics().max);
+ ASSERT_EQ("100", *rg1_column1->statistics().min);
+ ASSERT_EQ("200", *rg1_column1->statistics().max);
+ ASSERT_EQ(0, rg1_column1->statistics().null_count);
+ ASSERT_EQ(0, rg1_column2->statistics().null_count);
+ ASSERT_EQ(nrows, rg1_column1->statistics().distinct_count);
+ ASSERT_EQ(nrows, rg1_column2->statistics().distinct_count);
+ ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression());
+ ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression());
+ ASSERT_EQ(nrows / 2, rg1_column1->num_values());
+ ASSERT_EQ(nrows / 2, rg1_column2->num_values());
+ ASSERT_EQ(2, rg1_column1->encodings().size());
+ ASSERT_EQ(2, rg1_column2->encodings().size());
+ ASSERT_EQ(512, rg1_column1->total_compressed_size());
+ ASSERT_EQ(512, rg1_column2->total_compressed_size());
+ ASSERT_EQ(600, rg1_column1->total_uncompressed_size());
+ ASSERT_EQ(600, rg1_column2->total_uncompressed_size());
+ ASSERT_EQ(4, rg1_column1->dictionary_page_offset());
+ ASSERT_EQ(24, rg1_column2->dictionary_page_offset());
+ ASSERT_EQ(10, rg1_column1->data_page_offset());
+ ASSERT_EQ(30, rg1_column2->data_page_offset());
+
+ auto rg2_accessor = f_accessor->RowGroup(1);
+ ASSERT_EQ(2, rg2_accessor->num_columns());
+ ASSERT_EQ(nrows / 2, rg2_accessor->num_rows());
+ ASSERT_EQ(1024, rg2_accessor->total_byte_size());
+
+ auto rg2_column1 = rg2_accessor->ColumnChunk(0);
+ auto rg2_column2 = rg2_accessor->ColumnChunk(1);
+ ASSERT_EQ(true, rg2_column1->is_stats_set());
+ ASSERT_EQ(true, rg2_column2->is_stats_set());
+ ASSERT_EQ("100.100", *rg2_column2->statistics().min);
+ ASSERT_EQ("200.200", *rg2_column2->statistics().max);
+ ASSERT_EQ("100", *rg2_column1->statistics().min);
+ ASSERT_EQ("200", *rg2_column1->statistics().max);
+ ASSERT_EQ(0, rg2_column1->statistics().null_count);
+ ASSERT_EQ(0, rg2_column2->statistics().null_count);
+ ASSERT_EQ(nrows, rg2_column1->statistics().distinct_count);
+ ASSERT_EQ(nrows, rg2_column2->statistics().distinct_count);
+ ASSERT_EQ(nrows / 2, rg2_column1->num_values());
+ ASSERT_EQ(nrows / 2, rg2_column2->num_values());
+ ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
+ ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression());
+ ASSERT_EQ(2, rg2_column1->encodings().size());
+ ASSERT_EQ(2, rg2_column2->encodings().size());
+ ASSERT_EQ(512, rg2_column1->total_compressed_size());
+ ASSERT_EQ(512, rg2_column2->total_compressed_size());
+ ASSERT_EQ(600, rg2_column1->total_uncompressed_size());
+ ASSERT_EQ(600, rg2_column2->total_uncompressed_size());
+ ASSERT_EQ(6, rg2_column1->dictionary_page_offset());
+ ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
+ ASSERT_EQ(10, rg2_column1->data_page_offset());
+ ASSERT_EQ(26, rg2_column2->data_page_offset());
+}
+} // namespace metadata
+} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/file-serialize-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/file-serialize-test.cc b/src/parquet/file/file-serialize-test.cc
index ca7bb45..bd41e1e 100644
--- a/src/parquet/file/file-serialize-test.cc
+++ b/src/parquet/file/file-serialize-test.cc
@@ -79,13 +79,13 @@ class TestSerialize : public ::testing::Test {
auto buffer = sink->GetBuffer();
std::unique_ptr<RandomAccessSource> source(new BufferReader(buffer));
auto file_reader = ParquetFileReader::Open(std::move(source));
- ASSERT_EQ(1, file_reader->num_columns());
- ASSERT_EQ(1, file_reader->num_row_groups());
- ASSERT_EQ(100, file_reader->num_rows());
+ ASSERT_EQ(1, file_reader->metadata()->num_columns());
+ ASSERT_EQ(1, file_reader->metadata()->num_row_groups());
+ ASSERT_EQ(100, file_reader->metadata()->num_rows());
auto rg_reader = file_reader->RowGroup(0);
- ASSERT_EQ(1, rg_reader->num_columns());
- ASSERT_EQ(100, rg_reader->num_rows());
+ ASSERT_EQ(1, rg_reader->metadata()->num_columns());
+ ASSERT_EQ(100, rg_reader->metadata()->num_rows());
auto col_reader = std::static_pointer_cast<Int64Reader>(rg_reader->Column(0));
std::vector<int64_t> values_out(100);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/metadata.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/metadata.cc b/src/parquet/file/metadata.cc
new file mode 100644
index 0000000..c1fd767
--- /dev/null
+++ b/src/parquet/file/metadata.cc
@@ -0,0 +1,549 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <vector>
+
+#include "parquet/file/metadata.h"
+#include "parquet/schema/converter.h"
+#include "parquet/thrift/util.h"
+
+namespace parquet {
+
+// MetaData Accessor
+// ColumnChunk metadata
+class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
+ public:
+ explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column) : column_(column) {
+ const format::ColumnMetaData& meta_data = column->meta_data;
+ for (auto encoding : meta_data.encodings) {
+ encodings_.push_back(FromThrift(encoding));
+ }
+ if (meta_data.__isset.statistics) {
+ stats_.null_count = meta_data.statistics.null_count;
+ stats_.distinct_count = meta_data.statistics.distinct_count;
+ stats_.max = &meta_data.statistics.max;
+ stats_.min = &meta_data.statistics.min;
+ }
+ }
+ ~ColumnChunkMetaDataImpl() {}
+
+ // column chunk
+ inline int64_t file_offset() const { return column_->file_offset; }
+ inline const std::string& file_path() const { return column_->file_path; }
+
+ // column metadata
+ inline Type::type type() { return FromThrift(column_->meta_data.type); }
+
+ inline int64_t num_values() const { return column_->meta_data.num_values; }
+
+ std::shared_ptr<schema::ColumnPath> path_in_schema() {
+ return std::make_shared<schema::ColumnPath>(column_->meta_data.path_in_schema);
+ }
+
+ inline bool is_stats_set() const { return column_->meta_data.__isset.statistics; }
+
+ inline const ColumnStatistics& statistics() const { return stats_; }
+
+ inline Compression::type compression() const {
+ return FromThrift(column_->meta_data.codec);
+ }
+
+ const std::vector<Encoding::type>& encodings() const { return encodings_; }
+
+ inline int64_t has_dictionary_page() const {
+ return column_->meta_data.__isset.dictionary_page_offset;
+ }
+
+ inline int64_t dictionary_page_offset() const {
+ return column_->meta_data.dictionary_page_offset;
+ }
+
+ inline int64_t data_page_offset() const { return column_->meta_data.data_page_offset; }
+
+ inline int64_t index_page_offset() const {
+ return column_->meta_data.index_page_offset;
+ }
+
+ inline int64_t total_compressed_size() const {
+ return column_->meta_data.total_compressed_size;
+ }
+
+ inline int64_t total_uncompressed_size() const {
+ return column_->meta_data.total_uncompressed_size;
+ }
+
+ private:
+ ColumnStatistics stats_;
+ std::vector<Encoding::type> encodings_;
+ const format::ColumnChunk* column_;
+};
+
+std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(const uint8_t* metadata) {
+ return std::unique_ptr<ColumnChunkMetaData>(new ColumnChunkMetaData(metadata));
+}
+
+ColumnChunkMetaData::ColumnChunkMetaData(const uint8_t* metadata)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataImpl>(new ColumnChunkMetaDataImpl(
+ reinterpret_cast<const format::ColumnChunk*>(metadata)))} {}
+ColumnChunkMetaData::~ColumnChunkMetaData() {}
+
+// column chunk
+int64_t ColumnChunkMetaData::file_offset() const {
+ return impl_->file_offset();
+}
+
+const std::string& ColumnChunkMetaData::file_path() const {
+ return impl_->file_path();
+}
+
+// column metadata
+Type::type ColumnChunkMetaData::type() const {
+ return impl_->type();
+}
+
+int64_t ColumnChunkMetaData::num_values() const {
+ return impl_->num_values();
+}
+
+std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const {
+ return impl_->path_in_schema();
+}
+
+const ColumnStatistics& ColumnChunkMetaData::statistics() const {
+ return impl_->statistics();
+}
+
+bool ColumnChunkMetaData::is_stats_set() const {
+ return impl_->is_stats_set();
+}
+
+int64_t ColumnChunkMetaData::has_dictionary_page() const {
+ return impl_->has_dictionary_page();
+}
+
+int64_t ColumnChunkMetaData::dictionary_page_offset() const {
+ return impl_->dictionary_page_offset();
+}
+
+int64_t ColumnChunkMetaData::data_page_offset() const {
+ return impl_->data_page_offset();
+}
+
+int64_t ColumnChunkMetaData::index_page_offset() const {
+ return impl_->index_page_offset();
+}
+
+Compression::type ColumnChunkMetaData::compression() const {
+ return impl_->compression();
+}
+
+const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
+ return impl_->encodings();
+}
+
+int64_t ColumnChunkMetaData::total_uncompressed_size() const {
+ return impl_->total_uncompressed_size();
+}
+
+int64_t ColumnChunkMetaData::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+// row-group metadata
+class RowGroupMetaData::RowGroupMetaDataImpl {
+ public:
+ explicit RowGroupMetaDataImpl(const format::RowGroup* row_group)
+ : row_group_(row_group) {}
+ ~RowGroupMetaDataImpl() {}
+
+ inline int num_columns() const { return row_group_->columns.size(); }
+
+ inline int64_t num_rows() const { return row_group_->num_rows; }
+
+ inline int64_t total_byte_size() const { return row_group_->total_byte_size; }
+
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
+ DCHECK(i < num_columns()) << "The file only has " << num_columns()
+ << " columns, requested metadata for column: " << i;
+ return ColumnChunkMetaData::Make(
+ reinterpret_cast<const uint8_t*>(&row_group_->columns[i]));
+ }
+
+ private:
+ const format::RowGroup* row_group_;
+};
+
+// `metadata` must point at a live format::RowGroup (passed as uint8_t* to
+// keep thrift types out of the public header); the caller retains
+// ownership of that memory.
+std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(const uint8_t* metadata) {
+ return std::unique_ptr<RowGroupMetaData>(new RowGroupMetaData(metadata));
+}
+
+RowGroupMetaData::RowGroupMetaData(const uint8_t* metadata)
+ : impl_{std::unique_ptr<RowGroupMetaDataImpl>(new RowGroupMetaDataImpl(
+ reinterpret_cast<const format::RowGroup*>(metadata)))} {}
+// Out-of-line destructor: the PIMPL type is incomplete in the header, so
+// unique_ptr must be destroyed here where it is complete.
+RowGroupMetaData::~RowGroupMetaData() {}
+
+int RowGroupMetaData::num_columns() const {
+ return impl_->num_columns();
+}
+
+int64_t RowGroupMetaData::num_rows() const {
+ return impl_->num_rows();
+}
+
+int64_t RowGroupMetaData::total_byte_size() const {
+ return impl_->total_byte_size();
+}
+
+std::unique_ptr<ColumnChunkMetaData> RowGroupMetaData::ColumnChunk(int i) const {
+ return impl_->ColumnChunk(i);
+}
+
+// file metadata
+// Owns the deserialized thrift FileMetaData plus the SchemaDescriptor
+// derived from its flat schema element list.
+class FileMetaData::FileMetaDataImpl {
+ public:
+ FileMetaDataImpl() {}
+
+ // Deserializes a thrift-encoded footer. metadata_len is in/out:
+ // presumably updated by DeserializeThriftMsg to the number of bytes
+ // actually consumed -- TODO(review): confirm against its definition.
+ explicit FileMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) {
+ metadata_.reset(new format::FileMetaData);
+ DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
+ InitSchema();
+ }
+ ~FileMetaDataImpl() {}
+
+ inline int num_columns() const { return schema_.num_columns(); }
+ inline int64_t num_rows() const { return metadata_->num_rows; }
+ inline int num_row_groups() const { return metadata_->row_groups.size(); }
+ inline int32_t version() const { return metadata_->version; }
+ inline const std::string& created_by() const { return metadata_->created_by; }
+ inline int num_schema_elements() const { return metadata_->schema.size(); }
+
+ // Serializes the thrift struct to dst; 1024 looks like an initial
+ // transport buffer size, not a cap -- see SerializeThriftMsg.
+ void WriteTo(OutputStream* dst) { SerializeThriftMsg(metadata_.get(), 1024, dst); }
+
+ // Accessor for the i-th row group. Debug-only upper-bound check; a
+ // negative i is not caught.
+ std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
+ DCHECK(i < num_row_groups())
+ << "The file only has " << num_row_groups()
+ << " row groups, requested metadata for row group: " << i;
+ return RowGroupMetaData::Make(
+ reinterpret_cast<const uint8_t*>(&metadata_->row_groups[i]));
+ }
+
+ const SchemaDescriptor* schema_descriptor() const { return &schema_; }
+
+ private:
+ // FileMetaDataBuilder::Finish() installs metadata_ and calls InitSchema()
+ // directly on a default-constructed impl, hence the friendship.
+ friend FileMetaDataBuilder;
+ std::unique_ptr<format::FileMetaData> metadata_;
+ void InitSchema() {
+ schema::FlatSchemaConverter converter(
+ &metadata_->schema[0], metadata_->schema.size());
+ schema_.Init(converter.Convert());
+ }
+ SchemaDescriptor schema_;
+};
+
+std::unique_ptr<FileMetaData> FileMetaData::Make(
+ const uint8_t* metadata, uint32_t* metadata_len) {
+ return std::unique_ptr<FileMetaData>(new FileMetaData(metadata, metadata_len));
+}
+
+FileMetaData::FileMetaData(const uint8_t* metadata, uint32_t* metadata_len)
+ : impl_{std::unique_ptr<FileMetaDataImpl>(
+ new FileMetaDataImpl(metadata, metadata_len))} {}
+
+// Private default constructor: used only by FileMetaDataBuilder (a friend)
+// to create an empty object it then populates.
+FileMetaData::FileMetaData()
+ : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {}
+
+// Out-of-line destructor so unique_ptr<FileMetaDataImpl> is destroyed
+// where the impl type is complete.
+FileMetaData::~FileMetaData() {}
+
+std::unique_ptr<RowGroupMetaData> FileMetaData::RowGroup(int i) const {
+ return impl_->RowGroup(i);
+}
+
+int FileMetaData::num_columns() const {
+ return impl_->num_columns();
+}
+
+int64_t FileMetaData::num_rows() const {
+ return impl_->num_rows();
+}
+
+int FileMetaData::num_row_groups() const {
+ return impl_->num_row_groups();
+}
+
+int32_t FileMetaData::version() const {
+ return impl_->version();
+}
+
+const std::string& FileMetaData::created_by() const {
+ return impl_->created_by();
+}
+
+int FileMetaData::num_schema_elements() const {
+ return impl_->num_schema_elements();
+}
+
+const SchemaDescriptor* FileMetaData::schema_descriptor() const {
+ return impl_->schema_descriptor();
+}
+
+// Serializes the footer thrift struct to dst.
+void FileMetaData::WriteTo(OutputStream* dst) {
+ return impl_->WriteTo(dst);
+}
+
+// MetaData Builders
+// row-group metadata
+// Fills in a caller-owned format::ColumnChunk (passed as uint8_t* to keep
+// thrift types out of the public header).
+class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
+ public:
+ explicit ColumnChunkMetaDataBuilderImpl(const std::shared_ptr<WriterProperties>& props,
+ const ColumnDescriptor* column, uint8_t* contents)
+ : properties_(props), column_(column) {
+ column_chunk_ = reinterpret_cast<format::ColumnChunk*>(contents);
+ column_chunk_->meta_data.__set_type(ToThrift(column->physical_type()));
+ column_chunk_->meta_data.__set_path_in_schema(column->path()->ToDotVector());
+ column_chunk_->meta_data.__set_codec(
+ ToThrift(properties_->compression(column->path())));
+ }
+ ~ColumnChunkMetaDataBuilderImpl() {}
+
+ // column chunk
+ void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
+
+ // column metadata
+ // Copies the statistics values into the thrift struct. The generated
+ // __set_* setters must be used here: assigning the optional fields
+ // directly leaves their __isset flags false, and the values would be
+ // silently dropped when the metadata is serialized.
+ void SetStatistics(const ColumnStatistics& val) {
+ format::Statistics stats;
+ stats.__set_null_count(val.null_count);
+ stats.__set_distinct_count(val.distinct_count);
+ stats.__set_max(*val.max);
+ stats.__set_min(*val.min);
+
+ column_chunk_->meta_data.__set_statistics(stats);
+ }
+
+ // Commits sizes, offsets and encodings of the finished chunk.
+ // dictionary_fallback indicates the writer started with dictionary
+ // encoding but fell back to the plain column encoding mid-chunk.
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size,
+ int64_t uncompressed_size, bool dictionary_fallback = false) {
+ // file_offset is the start of the chunk's first page plus its total
+ // compressed size, i.e. one past the end of the chunk data.
+ if (dictionary_page_offset > 0) {
+ column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
+ // Record the dictionary page offset only when a dictionary page was
+ // actually written; setting it unconditionally to 0 would make
+ // readers believe a dictionary page exists at the start of the file.
+ column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
+ } else {
+ column_chunk_->__set_file_offset(data_page_offset + compressed_size);
+ }
+ column_chunk_->__isset.meta_data = true;
+ column_chunk_->meta_data.__set_num_values(num_values);
+ column_chunk_->meta_data.__set_index_page_offset(index_page_offset);
+ column_chunk_->meta_data.__set_data_page_offset(data_page_offset);
+ column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size);
+ column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
+ std::vector<format::Encoding::type> thrift_encodings;
+ // Repetition/definition levels are always RLE encoded.
+ thrift_encodings.push_back(ToThrift(Encoding::RLE));
+ if (properties_->dictionary_enabled(column_->path())) {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding()));
+ // In PARQUET_2_0 the dictionary index encoding can differ from the
+ // dictionary page encoding, so record it as well.
+ if (properties_->version() == ParquetVersion::PARQUET_2_0) {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
+ }
+ }
+ if (!properties_->dictionary_enabled(column_->path()) || dictionary_fallback) {
+ thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path())));
+ }
+ column_chunk_->meta_data.__set_encodings(thrift_encodings);
+ }
+
+ private:
+ format::ColumnChunk* column_chunk_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const ColumnDescriptor* column_;
+};
+
+// `contents` must point at a live, caller-owned format::ColumnChunk that
+// outlives the builder.
+std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
+ const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
+ uint8_t* contents) {
+ return std::unique_ptr<ColumnChunkMetaDataBuilder>(
+ new ColumnChunkMetaDataBuilder(props, column, contents));
+}
+
+ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
+ const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
+ uint8_t* contents)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
+ new ColumnChunkMetaDataBuilderImpl(props, column, contents))} {}
+
+ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() {}
+
+void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
+ impl_->set_file_path(path);
+}
+
+// Commits sizes, offsets and encodings; see the impl for details.
+void ColumnChunkMetaDataBuilder::Finish(int64_t num_values,
+ int64_t dictionary_page_offset, int64_t index_page_offset, int64_t data_page_offset,
+ int64_t compressed_size, int64_t uncompressed_size, bool dictionary_fallback) {
+ impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset,
+ compressed_size, uncompressed_size, dictionary_fallback);
+}
+
+void ColumnChunkMetaDataBuilder::SetStatistics(const ColumnStatistics& result) {
+ impl_->SetStatistics(result);
+}
+
+// Fills in a caller-owned format::RowGroup and hands out per-column-chunk
+// builders in schema order.
+class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
+ public:
+ explicit RowGroupMetaDataBuilderImpl(const std::shared_ptr<WriterProperties>& props,
+ const SchemaDescriptor* schema, uint8_t* contents)
+ : properties_(props), schema_(schema), current_column_(0) {
+ row_group_ = reinterpret_cast<format::RowGroup*>(contents);
+ InitializeColumns(schema->num_columns());
+ }
+ ~RowGroupMetaDataBuilderImpl() {}
+
+ // Returns the builder for the next column chunk, in schema order. The
+ // returned pointer is owned by this object and stays valid for its
+ // lifetime.
+ ColumnChunkMetaDataBuilder* NextColumnChunk() {
+ DCHECK(current_column_ < num_columns())
+ << "The schema only has " << num_columns()
+ << " columns, requested metadata for column: " << current_column_;
+ auto column = schema_->Column(current_column_);
+ auto column_builder = ColumnChunkMetaDataBuilder::Make(properties_, column,
+ reinterpret_cast<uint8_t*>(&row_group_->columns[current_column_++]));
+ auto column_builder_ptr = column_builder.get();
+ column_builders_.push_back(std::move(column_builder));
+ return column_builder_ptr;
+ }
+
+ // Commits num_rows and the aggregated compressed byte size. All column
+ // chunks must have been finished (file_offset set) before this is called.
+ void Finish(int64_t num_rows) {
+ // current_column_ is the count of initialized columns, so report it
+ // as-is (the previous message printed current_column_ - 1).
+ DCHECK(current_column_ == schema_->num_columns())
+ << "Only " << current_column_ << " out of " << schema_->num_columns()
+ << " columns are initialized";
+ // int64_t matches the thrift total_byte_size field and avoids mixing
+ // unsigned arithmetic with the signed field values.
+ int64_t total_byte_size = 0;
+
+ for (int i = 0; i < schema_->num_columns(); i++) {
+ DCHECK(row_group_->columns[i].file_offset > 0) << "Column " << i
+ << " is not complete.";
+ total_byte_size += row_group_->columns[i].meta_data.total_compressed_size;
+ }
+
+ row_group_->__set_total_byte_size(total_byte_size);
+ row_group_->__set_num_rows(num_rows);
+ }
+
+ private:
+ int num_columns() { return row_group_->columns.size(); }
+
+ void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); }
+
+ format::RowGroup* row_group_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const SchemaDescriptor* schema_;
+ std::vector<std::unique_ptr<ColumnChunkMetaDataBuilder>> column_builders_;
+ int current_column_;
+};
+
+// `contents` must point at a live, caller-owned format::RowGroup that
+// outlives the builder.
+std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make(
+ const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
+ uint8_t* contents) {
+ return std::unique_ptr<RowGroupMetaDataBuilder>(
+ new RowGroupMetaDataBuilder(props, schema_, contents));
+}
+
+RowGroupMetaDataBuilder::RowGroupMetaDataBuilder(
+ const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
+ uint8_t* contents)
+ : impl_{std::unique_ptr<RowGroupMetaDataBuilderImpl>(
+ new RowGroupMetaDataBuilderImpl(props, schema_, contents))} {}
+
+RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() {}
+
+// Returned pointer is owned by this builder; see the impl for details.
+ColumnChunkMetaDataBuilder* RowGroupMetaDataBuilder::NextColumnChunk() {
+ return impl_->NextColumnChunk();
+}
+
+void RowGroupMetaDataBuilder::Finish(int64_t num_rows) {
+ impl_->Finish(num_rows);
+}
+
+// file metadata
+// Accumulates row groups and produces the final FileMetaData object.
+class FileMetaDataBuilder::FileMetaDataBuilderImpl {
+ public:
+ explicit FileMetaDataBuilderImpl(
+ const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props)
+ : properties_(props), schema_(schema) {
+ metadata_.reset(new format::FileMetaData());
+ }
+ ~FileMetaDataBuilderImpl() {}
+
+ // Returns a builder for a new row group; the builder and the thrift
+ // struct it fills in remain owned by this object.
+ RowGroupMetaDataBuilder* AppendRowGroup() {
+ auto row_group = std::unique_ptr<format::RowGroup>(new format::RowGroup());
+ auto row_group_builder = RowGroupMetaDataBuilder::Make(
+ properties_, schema_, reinterpret_cast<uint8_t*>(row_group.get()));
+ RowGroupMetaDataBuilder* row_group_ptr = row_group_builder.get();
+ row_group_builders_.push_back(std::move(row_group_builder));
+ row_groups_.push_back(std::move(row_group));
+ return row_group_ptr;
+ }
+
+ // Assembles the thrift FileMetaData (row groups, row total, version,
+ // created_by, flattened schema) and transfers it into a new FileMetaData.
+ // The builder must not be reused afterwards: metadata_ is moved out.
+ std::unique_ptr<FileMetaData> Finish() {
+ int64_t total_rows = 0;
+ std::vector<format::RowGroup> row_groups;
+ // Copy each row group once, directly into the output vector (the
+ // previous code made a local copy and then copied again on push_back).
+ row_groups.reserve(row_groups_.size());
+ for (const auto& row_group : row_groups_) {
+ row_groups.push_back(*row_group);
+ total_rows += row_group->num_rows;
+ }
+ metadata_->__set_num_rows(total_rows);
+ metadata_->__set_row_groups(row_groups);
+ metadata_->__set_version(properties_->version());
+ metadata_->__set_created_by(properties_->created_by());
+ parquet::schema::SchemaFlattener flattener(
+ static_cast<parquet::schema::GroupNode*>(schema_->schema().get()),
+ &metadata_->schema);
+ flattener.Flatten();
+ // FileMetaData's default ctor is private; this impl is a friend.
+ auto file_meta_data = std::unique_ptr<FileMetaData>(new FileMetaData());
+ file_meta_data->impl_->metadata_ = std::move(metadata_);
+ file_meta_data->impl_->InitSchema();
+ return file_meta_data;
+ }
+
+ protected:
+ std::unique_ptr<format::FileMetaData> metadata_;
+
+ private:
+ const std::shared_ptr<WriterProperties> properties_;
+ std::vector<std::unique_ptr<format::RowGroup>> row_groups_;
+ std::vector<std::unique_ptr<RowGroupMetaDataBuilder>> row_group_builders_;
+ const SchemaDescriptor* schema_;
+};
+
+std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
+ const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props) {
+ return std::unique_ptr<FileMetaDataBuilder>(new FileMetaDataBuilder(schema, props));
+}
+
+FileMetaDataBuilder::FileMetaDataBuilder(
+ const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props)
+ : impl_{std::unique_ptr<FileMetaDataBuilderImpl>(
+ new FileMetaDataBuilderImpl(schema, props))} {}
+
+FileMetaDataBuilder::~FileMetaDataBuilder() {}
+
+// Returned pointer is owned by this builder; see the impl for details.
+RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() {
+ return impl_->AppendRowGroup();
+}
+
+// Consumes the accumulated state; the builder must not be reused after.
+std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() {
+ return impl_->Finish();
+}
+
+} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/metadata.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/metadata.h b/src/parquet/file/metadata.h
new file mode 100644
index 0000000..c35f82f
--- /dev/null
+++ b/src/parquet/file/metadata.h
@@ -0,0 +1,203 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_FILE_METADATA_H
+#define PARQUET_FILE_METADATA_H
+
+#include <string>
+#include <vector>
+#include <set>
+
+#include "parquet/column/properties.h"
+#include "parquet/compression/codec.h"
+#include "parquet/schema/descriptor.h"
+#include "parquet/types.h"
+#include "parquet/util/output.h"
+#include "parquet/util/visibility.h"
+
+namespace parquet {
+
+// ColumnStatistics does not own the min/max values; the pointed-to strings
+// must outlive this struct. Readers should check
+// ColumnChunkMetaData::is_stats_set() before dereferencing min/max.
+struct ColumnStatistics {
+ int64_t null_count;
+ int64_t distinct_count;
+ const std::string* min;
+ const std::string* max;
+};
+
+// Read-only accessor for one column chunk's metadata. Create via Make()
+// from a pointer to a deserialized thrift ColumnChunk struct; the accessor
+// does not own that memory.
+class PARQUET_EXPORT ColumnChunkMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::unique_ptr<ColumnChunkMetaData> Make(const uint8_t* metadata);
+
+ ~ColumnChunkMetaData();
+
+ // column chunk
+ int64_t file_offset() const;
+ // parameter is only used when a dataset is spread across multiple files
+ const std::string& file_path() const;
+ // column metadata
+ Type::type type() const;
+ int64_t num_values() const;
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+ // Check is_stats_set() before relying on statistics().
+ bool is_stats_set() const;
+ const ColumnStatistics& statistics() const;
+ Compression::type compression() const;
+ const std::vector<Encoding::type>& encodings() const;
+ // NOTE(review): semantically a yes/no answer; consider returning bool
+ // (would require a coordinated change to the definition).
+ int64_t has_dictionary_page() const;
+ int64_t dictionary_page_offset() const;
+ int64_t data_page_offset() const;
+ int64_t index_page_offset() const;
+ int64_t total_compressed_size() const;
+ int64_t total_uncompressed_size() const;
+
+ private:
+ explicit ColumnChunkMetaData(const uint8_t* metadata);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataImpl;
+ std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
+};
+
+// Read-only accessor for one row group's metadata; a non-owning view over
+// a deserialized thrift RowGroup struct.
+class PARQUET_EXPORT RowGroupMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::unique_ptr<RowGroupMetaData> Make(const uint8_t* metadata);
+
+ ~RowGroupMetaData();
+
+ // row-group metadata
+ int num_columns() const;
+ int64_t num_rows() const;
+ int64_t total_byte_size() const;
+ // Accessor for the i-th column chunk (bounds checked only via DCHECK).
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) const;
+
+ private:
+ explicit RowGroupMetaData(const uint8_t* metadata);
+ // PIMPL Idiom
+ class RowGroupMetaDataImpl;
+ std::unique_ptr<RowGroupMetaDataImpl> impl_;
+};
+
+class FileMetaDataBuilder;
+
+// Accessor for a parquet file's footer metadata. Created either by
+// deserializing a footer (Make) or by FileMetaDataBuilder::Finish().
+class PARQUET_EXPORT FileMetaData {
+ public:
+ // API convenience to get a MetaData accessor.
+ // metadata_len is in/out; presumably updated to the bytes consumed by
+ // deserialization -- TODO(review): confirm.
+ static std::unique_ptr<FileMetaData> Make(
+ const uint8_t* serialized_metadata, uint32_t* metadata_len);
+
+ ~FileMetaData();
+
+ // file metadata
+ int num_columns() const;
+ int64_t num_rows() const;
+ int num_row_groups() const;
+ int32_t version() const;
+ const std::string& created_by() const;
+ int num_schema_elements() const;
+ // Accessor for the i-th row group (bounds checked only via DCHECK).
+ std::unique_ptr<RowGroupMetaData> RowGroup(int i) const;
+
+ // Serializes the footer thrift struct to dst.
+ void WriteTo(OutputStream* dst);
+
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema_descriptor() const;
+
+ private:
+ // The builder constructs an empty FileMetaData via the private default
+ // constructor and fills it in.
+ friend FileMetaDataBuilder;
+ explicit FileMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
+ // PIMPL Idiom
+ FileMetaData();
+ class FileMetaDataImpl;
+ std::unique_ptr<FileMetaDataImpl> impl_;
+};
+
+// Builder API
+// Builder for one column chunk's metadata; writes into a caller-owned
+// format::ColumnChunk passed via `contents`.
+class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
+ public:
+ // API convenience to get a MetaData builder
+ static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+ const std::shared_ptr<WriterProperties>& props, const ColumnDescriptor* column,
+ uint8_t* contents);
+
+ ~ColumnChunkMetaDataBuilder();
+
+ // column chunk
+ // Used when a dataset is spread across multiple files
+ void set_file_path(const std::string& path);
+ // column metadata
+ // The min/max strings pointed to by stats are copied into the metadata;
+ // the caller keeps ownership of them.
+ void SetStatistics(const ColumnStatistics& stats);
+
+ // commit the metadata
+ // (fixed parameter-name typo: was "dictonary_page_offset")
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset, int64_t compressed_size,
+ int64_t uncompressed_size, bool dictionary_fallback);
+
+ private:
+ explicit ColumnChunkMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
+ const ColumnDescriptor* column, uint8_t* contents);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataBuilderImpl;
+ std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
+};
+
+// Builder for one row group's metadata; writes into a caller-owned
+// format::RowGroup passed via `contents`.
+class PARQUET_EXPORT RowGroupMetaDataBuilder {
+ public:
+ // API convenience to get a MetaData builder
+ static std::unique_ptr<RowGroupMetaDataBuilder> Make(
+ const std::shared_ptr<WriterProperties>& props, const SchemaDescriptor* schema_,
+ uint8_t* contents);
+
+ ~RowGroupMetaDataBuilder();
+
+ // Returns the builder for the next column chunk in schema order; the
+ // returned pointer is owned by this builder.
+ ColumnChunkMetaDataBuilder* NextColumnChunk();
+
+ // commit the metadata
+ void Finish(int64_t num_rows);
+
+ private:
+ explicit RowGroupMetaDataBuilder(const std::shared_ptr<WriterProperties>& props,
+ const SchemaDescriptor* schema_, uint8_t* contents);
+ // PIMPL Idiom
+ class RowGroupMetaDataBuilderImpl;
+ std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
+};
+
+// Top-level builder for a file's footer metadata.
+class PARQUET_EXPORT FileMetaDataBuilder {
+ public:
+ // API convenience to get a MetaData builder
+ static std::unique_ptr<FileMetaDataBuilder> Make(
+ const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props);
+
+ ~FileMetaDataBuilder();
+
+ // Returns a builder for a new row group; the returned pointer is owned
+ // by this builder.
+ RowGroupMetaDataBuilder* AppendRowGroup();
+
+ // commit the metadata
+ // Consumes the accumulated state; do not reuse the builder afterwards.
+ std::unique_ptr<FileMetaData> Finish();
+
+ private:
+ explicit FileMetaDataBuilder(
+ const SchemaDescriptor* schema, const std::shared_ptr<WriterProperties>& props);
+ // PIMPL Idiom
+ class FileMetaDataBuilderImpl;
+ std::unique_ptr<FileMetaDataBuilderImpl> impl_;
+};
+
+} // namespace parquet
+
+#endif // PARQUET_FILE_METADATA_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader-internal.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.cc b/src/parquet/file/reader-internal.cc
index 9e592b5..5c1bc37 100644
--- a/src/parquet/file/reader-internal.cc
+++ b/src/parquet/file/reader-internal.cc
@@ -141,76 +141,26 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
return std::shared_ptr<Page>(nullptr);
}
-// ----------------------------------------------------------------------
-// SerializedRowGroup
-
-int64_t SerializedRowGroup::num_rows() const {
- return metadata_->num_rows;
-}
-
-int SerializedRowGroup::num_columns() const {
- return metadata_->columns.size();
+const RowGroupMetaData* SerializedRowGroup::metadata() const {
+ return row_group_metadata_.get();
}
std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) {
// Read column chunk from the file
- const format::ColumnChunk& col = metadata_->columns[i];
+ auto col = row_group_metadata_->ColumnChunk(i);
- int64_t col_start = col.meta_data.data_page_offset;
- if (col.meta_data.__isset.dictionary_page_offset &&
- col_start > col.meta_data.dictionary_page_offset) {
- col_start = col.meta_data.dictionary_page_offset;
+ int64_t col_start = col->data_page_offset();
+ if (col->has_dictionary_page() && col_start > col->dictionary_page_offset()) {
+ col_start = col->dictionary_page_offset();
}
- int64_t bytes_to_read = col.meta_data.total_compressed_size;
+ int64_t bytes_to_read = col->total_compressed_size();
std::unique_ptr<InputStream> stream;
stream = properties_.GetStream(source_, col_start, bytes_to_read);
return std::unique_ptr<PageReader>(new SerializedPageReader(
- std::move(stream), FromThrift(col.meta_data.codec), properties_.allocator()));
-}
-
-RowGroupStatistics SerializedRowGroup::GetColumnStats(int i) const {
- const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data;
-
- RowGroupStatistics result;
- result.num_values = meta_data.num_values;
- result.null_count = meta_data.statistics.null_count;
- result.distinct_count = meta_data.statistics.distinct_count;
- result.max = &meta_data.statistics.max;
- result.min = &meta_data.statistics.min;
- return result;
-}
-
-bool SerializedRowGroup::IsColumnStatsSet(int i) const {
- const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data;
- return meta_data.__isset.statistics;
-}
-
-Compression::type SerializedRowGroup::GetColumnCompression(int i) const {
- const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data;
- return FromThrift(meta_data.codec);
-}
-
-std::vector<Encoding::type> SerializedRowGroup::GetColumnEncodings(int i) const {
- const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data;
-
- std::vector<Encoding::type> encodings;
- for (auto encoding : meta_data.encodings) {
- encodings.push_back(FromThrift(encoding));
- }
- return encodings;
-}
-
-int64_t SerializedRowGroup::GetColumnUnCompressedSize(int i) const {
- const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data;
- return meta_data.total_uncompressed_size;
-}
-
-int64_t SerializedRowGroup::GetColumnCompressedSize(int i) const {
- const format::ColumnMetaData& meta_data = metadata_->columns[i].meta_data;
- return meta_data.total_compressed_size;
+ std::move(stream), col->compression(), properties_.allocator()));
}
// ----------------------------------------------------------------------
@@ -242,23 +192,15 @@ SerializedFile::~SerializedFile() {
}
std::shared_ptr<RowGroupReader> SerializedFile::GetRowGroup(int i) {
- std::unique_ptr<SerializedRowGroup> contents(
- new SerializedRowGroup(source_.get(), &metadata_.row_groups[i], properties_));
+ std::unique_ptr<SerializedRowGroup> contents(new SerializedRowGroup(
+ source_.get(), std::move(file_metadata_->RowGroup(i)), properties_));
return std::make_shared<RowGroupReader>(
- &schema_, std::move(contents), properties_.allocator());
-}
-
-int64_t SerializedFile::num_rows() const {
- return metadata_.num_rows;
-}
-
-int SerializedFile::num_columns() const {
- return schema_.num_columns();
+ file_metadata_->schema_descriptor(), std::move(contents), properties_.allocator());
}
-int SerializedFile::num_row_groups() const {
- return metadata_.row_groups.size();
+const FileMetaData* SerializedFile::metadata() const {
+ return file_metadata_.get();
}
SerializedFile::SerializedFile(std::unique_ptr<RandomAccessSource> source,
@@ -293,10 +235,8 @@ void SerializedFile::ParseMetaData() {
if (bytes_read != metadata_len) {
throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
}
- DeserializeThriftMsg(&metadata_buffer[0], &metadata_len, &metadata_);
- schema::FlatSchemaConverter converter(&metadata_.schema[0], metadata_.schema.size());
- schema_.Init(converter.Convert());
+ file_metadata_ = FileMetaData::Make(&metadata_buffer[0], &metadata_len);
}
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader-internal.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader-internal.h b/src/parquet/file/reader-internal.h
index 0c3c9f7..48e2daf 100644
--- a/src/parquet/file/reader-internal.h
+++ b/src/parquet/file/reader-internal.h
@@ -25,6 +25,7 @@
#include "parquet/column/page.h"
#include "parquet/column/properties.h"
#include "parquet/compression/codec.h"
+#include "parquet/file/metadata.h"
#include "parquet/file/reader.h"
#include "parquet/thrift/parquet_types.h"
#include "parquet/types.h"
@@ -69,23 +70,17 @@ class SerializedPageReader : public PageReader {
// RowGroupReader::Contents implementation for the Parquet file specification
class SerializedRowGroup : public RowGroupReader::Contents {
public:
- SerializedRowGroup(RandomAccessSource* source, const format::RowGroup* metadata,
- ReaderProperties props)
- : source_(source), metadata_(metadata), properties_(props) {}
+ SerializedRowGroup(RandomAccessSource* source,
+ std::unique_ptr<RowGroupMetaData> metadata, const ReaderProperties props)
+ : source_(source), row_group_metadata_(std::move(metadata)), properties_(props) {}
+
+ virtual const RowGroupMetaData* metadata() const;
- virtual int num_columns() const;
- virtual int64_t num_rows() const;
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i);
- virtual RowGroupStatistics GetColumnStats(int i) const;
- virtual bool IsColumnStatsSet(int i) const;
- virtual Compression::type GetColumnCompression(int i) const;
- virtual std::vector<Encoding::type> GetColumnEncodings(int i) const;
- virtual int64_t GetColumnCompressedSize(int i) const;
- virtual int64_t GetColumnUnCompressedSize(int i) const;
private:
RandomAccessSource* source_;
- const format::RowGroup* metadata_;
+ std::unique_ptr<RowGroupMetaData> row_group_metadata_;
ReaderProperties properties_;
};
@@ -103,9 +98,7 @@ class SerializedFile : public ParquetFileReader::Contents {
ReaderProperties props = default_reader_properties());
virtual void Close();
virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i);
- virtual int64_t num_rows() const;
- virtual int num_columns() const;
- virtual int num_row_groups() const;
+ virtual const FileMetaData* metadata() const;
virtual ~SerializedFile();
private:
@@ -114,7 +107,7 @@ class SerializedFile : public ParquetFileReader::Contents {
std::unique_ptr<RandomAccessSource> source, ReaderProperties props);
std::unique_ptr<RandomAccessSource> source_;
- format::FileMetaData metadata_;
+ std::unique_ptr<FileMetaData> file_metadata_;
ReaderProperties properties_;
void ParseMetaData();
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index aabcc2b..b6de168 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -30,6 +30,7 @@
#include "parquet/exception.h"
#include "parquet/file/reader-internal.h"
#include "parquet/util/input.h"
+#include "parquet/util/logging.h"
#include "parquet/types.h"
using std::string;
@@ -44,50 +45,24 @@ RowGroupReader::RowGroupReader(const SchemaDescriptor* schema,
std::unique_ptr<Contents> contents, MemoryAllocator* allocator)
: schema_(schema), contents_(std::move(contents)), allocator_(allocator) {}
-int RowGroupReader::num_columns() const {
- return contents_->num_columns();
-}
-
-int64_t RowGroupReader::num_rows() const {
- return contents_->num_rows();
-}
-
std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
- // TODO: boundschecking
+ DCHECK(i < schema_->num_columns()) << "The RowGroup only has " << schema_->num_columns()
+ << "columns, requested column: " << i;
const ColumnDescriptor* descr = schema_->Column(i);
std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
return ColumnReader::Make(descr, std::move(page_reader), allocator_);
}
-RowGroupStatistics RowGroupReader::GetColumnStats(int i) const {
- return contents_->GetColumnStats(i);
-}
-
-bool RowGroupReader::IsColumnStatsSet(int i) const {
- return contents_->IsColumnStatsSet(i);
-}
-
-Compression::type RowGroupReader::GetColumnCompression(int i) const {
- return contents_->GetColumnCompression(i);
-}
-
-std::vector<Encoding::type> RowGroupReader::GetColumnEncodings(int i) const {
- return contents_->GetColumnEncodings(i);
-}
-
-int64_t RowGroupReader::GetColumnUnCompressedSize(int i) const {
- return contents_->GetColumnUnCompressedSize(i);
-}
-
-int64_t RowGroupReader::GetColumnCompressedSize(int i) const {
- return contents_->GetColumnCompressedSize(i);
+// Returns the rowgroup metadata
+const RowGroupMetaData* RowGroupReader::metadata() const {
+ return contents_->metadata();
}
// ----------------------------------------------------------------------
// ParquetFileReader public API
-ParquetFileReader::ParquetFileReader() : schema_(nullptr) {}
+ParquetFileReader::ParquetFileReader() {}
ParquetFileReader::~ParquetFileReader() {
Close();
}
@@ -117,33 +92,20 @@ std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) {
contents_ = std::move(contents);
- schema_ = contents_->schema();
}
void ParquetFileReader::Close() {
if (contents_) { contents_->Close(); }
}
-int ParquetFileReader::num_row_groups() const {
- return contents_->num_row_groups();
-}
-
-int64_t ParquetFileReader::num_rows() const {
- return contents_->num_rows();
-}
-
-int ParquetFileReader::num_columns() const {
- return schema_->num_columns();
+const FileMetaData* ParquetFileReader::metadata() const {
+ return contents_->metadata();
}
std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
- if (i >= num_row_groups()) {
- std::stringstream ss;
- ss << "The file only has " << num_row_groups()
- << "row groups, requested reader for: " << i;
- throw ParquetException(ss.str());
- }
-
+ DCHECK(i < metadata()->num_row_groups()) << "The file only has "
+ << metadata()->num_row_groups()
+ << "row groups, requested reader for: " << i;
return contents_->GetRowGroup(i);
}
@@ -155,43 +117,57 @@ std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
void ParquetFileReader::DebugPrint(
std::ostream& stream, std::list<int> selected_columns, bool print_values) {
+ const FileMetaData* file_metadata = metadata();
+
stream << "File statistics:\n";
- stream << "Total rows: " << num_rows() << "\n";
+ stream << "Version: " << file_metadata->version() << "\n";
+ stream << "Created By: " << file_metadata->created_by() << "\n";
+ stream << "Total rows: " << file_metadata->num_rows() << "\n";
+ stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
+ stream << "Number of Real Columns: "
+ << file_metadata->schema_descriptor()->group()->field_count() << "\n";
if (selected_columns.size() == 0) {
- for (int i = 0; i < num_columns(); i++) {
+ for (int i = 0; i < file_metadata->num_columns(); i++) {
selected_columns.push_back(i);
}
} else {
for (auto i : selected_columns) {
- if (i < 0 || i >= num_columns()) {
+ if (i < 0 || i >= file_metadata->num_columns()) {
throw ParquetException("Selected column is out of range");
}
}
}
+ stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
+ stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
for (auto i : selected_columns) {
- const ColumnDescriptor* descr = schema_->Column(i);
+ const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i);
stream << "Column " << i << ": " << descr->name() << " ("
<< type_to_string(descr->physical_type()) << ")" << std::endl;
}
- for (int r = 0; r < num_row_groups(); ++r) {
+ for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
stream << "--- Row Group " << r << " ---\n";
auto group_reader = RowGroup(r);
+ std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+ stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
+    stream << "  rows: " << group_metadata->num_rows() << " ---\n";
// Print column metadata
for (auto i : selected_columns) {
- RowGroupStatistics stats = group_reader->GetColumnStats(i);
+ auto column_chunk = group_metadata->ColumnChunk(i);
+ const ColumnStatistics stats = column_chunk->statistics();
- const ColumnDescriptor* descr = schema_->Column(i);
+ const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i);
stream << "Column " << i << std::endl
- << " rows: " << group_reader->num_rows() << ", values: " << stats.num_values
- << ", null values: " << stats.null_count
- << ", distinct values: " << stats.distinct_count << std::endl;
- if (group_reader->IsColumnStatsSet(i)) {
- stream << " max: " << FormatStatValue(descr->physical_type(), stats.max->c_str())
+ << ", values: " << column_chunk->num_values();
+ if (column_chunk->is_stats_set()) {
+ stream << ", null values: " << stats.null_count
+ << ", distinct values: " << stats.distinct_count << std::endl
+ << " max: " << FormatStatValue(descr->physical_type(), stats.max->c_str())
<< ", min: "
<< FormatStatValue(descr->physical_type(), stats.min->c_str());
} else {
@@ -199,15 +175,16 @@ void ParquetFileReader::DebugPrint(
}
stream << std::endl
<< " compression: "
- << compression_to_string(group_reader->GetColumnCompression(i))
+ << compression_to_string(column_chunk->compression())
<< ", encodings: ";
- for (auto encoding : group_reader->GetColumnEncodings(i)) {
+ for (auto encoding : column_chunk->encodings()) {
stream << encoding_to_string(encoding) << " ";
}
stream << std::endl
- << " uncompressed size: " << group_reader->GetColumnUnCompressedSize(i)
- << ", compressed size: " << group_reader->GetColumnCompressedSize(i)
- << std::endl;
+ << " uncompressed size: "
+ << column_chunk->total_uncompressed_size()
+ << ", compressed size: "
+ << column_chunk->total_compressed_size() << std::endl;
}
if (!print_values) { continue; }
@@ -225,7 +202,8 @@ void ParquetFileReader::DebugPrint(
ss << "%-" << COL_WIDTH << "s";
std::string fmt = ss.str();
- snprintf(buffer, bufsize, fmt.c_str(), column_schema(i)->name().c_str());
+ snprintf(buffer, bufsize, fmt.c_str(),
+ file_metadata->schema_descriptor()->Column(i)->name().c_str());
stream << buffer;
// This is OK in this method as long as the RowGroupReader does not get
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/file/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h
index 8e0d26f..baa3e30 100644
--- a/src/parquet/file/reader.h
+++ b/src/parquet/file/reader.h
@@ -27,6 +27,7 @@
#include "parquet/column/page.h"
#include "parquet/column/properties.h"
+#include "parquet/file/metadata.h"
#include "parquet/schema/descriptor.h"
#include "parquet/util/visibility.h"
@@ -35,54 +36,31 @@ namespace parquet {
class ColumnReader;
class RandomAccessSource;
-struct RowGroupStatistics {
- int64_t num_values;
- int64_t null_count;
- int64_t distinct_count;
- const std::string* min;
- const std::string* max;
-};
-
class PARQUET_EXPORT RowGroupReader {
public:
// Forward declare the PIMPL
struct Contents {
- virtual int num_columns() const = 0;
- virtual int64_t num_rows() const = 0;
+ virtual ~Contents() {}
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
- virtual RowGroupStatistics GetColumnStats(int i) const = 0;
- virtual bool IsColumnStatsSet(int i) const = 0;
- virtual Compression::type GetColumnCompression(int i) const = 0;
- virtual std::vector<Encoding::type> GetColumnEncodings(int i) const = 0;
- virtual int64_t GetColumnCompressedSize(int i) const = 0;
- virtual int64_t GetColumnUnCompressedSize(int i) const = 0;
+ virtual const RowGroupMetaData* metadata() const = 0;
};
RowGroupReader(const SchemaDescriptor* schema, std::unique_ptr<Contents> contents,
MemoryAllocator* allocator);
+ // Returns the rowgroup metadata
+ const RowGroupMetaData* metadata() const;
+
// Construct a ColumnReader for the indicated row group-relative
// column. Ownership is shared with the RowGroupReader.
std::shared_ptr<ColumnReader> Column(int i);
- int num_columns() const;
- int64_t num_rows() const;
-
- RowGroupStatistics GetColumnStats(int i) const;
- bool IsColumnStatsSet(int i) const;
- Compression::type GetColumnCompression(int i) const;
- std::vector<Encoding::type> GetColumnEncodings(int i) const;
- int64_t GetColumnCompressedSize(int i) const;
- int64_t GetColumnUnCompressedSize(int i) const;
private:
- // Owned by the parent ParquetFileReader
const SchemaDescriptor* schema_;
-
// PIMPL idiom
// This is declared in the .cc file so that we can hide compiled Thrift
// headers from the public API and also more easily create test fixtures.
std::unique_ptr<Contents> contents_;
-
MemoryAllocator* allocator_;
};
@@ -93,16 +71,8 @@ class PARQUET_EXPORT ParquetFileReader {
virtual ~Contents() {}
// Perform any cleanup associated with the file contents
virtual void Close() = 0;
-
virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
-
- virtual int64_t num_rows() const = 0;
- virtual int num_columns() const = 0;
- virtual int num_row_groups() const = 0;
-
- // Return const-poitner to make it clear that this object is not to be copied
- const SchemaDescriptor* schema() const { return &schema_; }
- SchemaDescriptor schema_;
+ virtual const FileMetaData* metadata() const = 0;
};
ParquetFileReader();
@@ -122,14 +92,8 @@ class PARQUET_EXPORT ParquetFileReader {
// The RowGroupReader is owned by the FileReader
std::shared_ptr<RowGroupReader> RowGroup(int i);
- int num_columns() const;
- int64_t num_rows() const;
- int num_row_groups() const;
-
- // Returns the file schema descriptor
- const SchemaDescriptor* descr() { return schema_; }
-
- const ColumnDescriptor* column_schema(int i) const { return schema_->Column(i); }
+ // Returns the file metadata
+ const FileMetaData* metadata() const;
void DebugPrint(
std::ostream& stream, std::list<int> selected_columns, bool print_values = true);
@@ -139,9 +103,6 @@ class PARQUET_EXPORT ParquetFileReader {
// This is declared in the .cc file so that we can hide compiled Thrift
// headers from the public API and also more easily create test fixtures.
std::unique_ptr<Contents> contents_;
-
- // The SchemaDescriptor is provided by the Contents impl
- const SchemaDescriptor* schema_;
};
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index 9a8fcf6..2e28c80 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -68,11 +68,11 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
int32_t values[4];
// This file only has 8 rows
- ASSERT_EQ(8, reader_->num_rows());
+ ASSERT_EQ(8, reader_->metadata()->num_rows());
// This file only has 1 row group
- ASSERT_EQ(1, reader_->num_row_groups());
+ ASSERT_EQ(1, reader_->metadata()->num_row_groups());
// This row group must have 8 rows
- ASSERT_EQ(8, group->num_rows());
+ ASSERT_EQ(8, group->metadata()->num_rows());
ASSERT_TRUE(col->HasNext());
int64_t values_read;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c0fd08a9/src/parquet/util/bpacking.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bpacking.h b/src/parquet/util/bpacking.h
index d9ae531..f407538 100644
--- a/src/parquet/util/bpacking.h
+++ b/src/parquet/util/bpacking.h
@@ -13,6 +13,8 @@
#ifndef PARQUET_UTIL_BPACKING_H
#define PARQUET_UTIL_BPACKING_H
+#include <stdexcept>
+
namespace parquet {
inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {