You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ju...@apache.org on 2016/02/06 03:01:48 UTC
[1/2] parquet-cpp git commit: PARQUET-442: Nested schema conversion,
Thrift struct decoupling, dump-schema utility
Repository: parquet-cpp
Updated Branches:
refs/heads/master c0eec9a59 -> 04d75c7cb
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/schema-types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc
new file mode 100644
index 0000000..72d38c0
--- /dev/null
+++ b/src/parquet/schema/schema-types-test.cc
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "parquet/util/test-common.h"
+
+#include "parquet/schema/types.h"
+#include "parquet/schema/test-util.h"
+
+using std::string;
+using std::vector;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+// Fixture for PrimitiveNode conversion tests. Provides a canonical
+// name/id pair and a helper that converts a Thrift SchemaElement into a
+// PrimitiveNode for the assertions.
+class TestPrimitiveNode : public ::testing::Test {
+ public:
+  // NOTE: googletest invokes SetUp() (capital S). The previous spelling
+  // "setUp" was never called, so name_ and id_ were left uninitialized
+  // when the FromParquet test ran.
+  virtual void SetUp() {
+    name_ = "name";
+    id_ = 5;
+  }
+
+  // Converts `element` and stores both the owning pointer and a
+  // downcast, non-owning view used by the test assertions.
+  void Convert(const parquet::SchemaElement* element) {
+    node_ = PrimitiveNode::FromParquet(element, id_);
+    ASSERT_TRUE(node_->is_primitive());
+    prim_node_ = static_cast<const PrimitiveNode*>(node_.get());
+  }
+
+ protected:
+  std::string name_;
+  const PrimitiveNode* prim_node_;
+
+  int id_;
+  std::unique_ptr<Node> node_;
+};
+
+// Exercises the PrimitiveNode attribute accessors: name, node type,
+// repetition predicates, physical type, and logical type.
+TEST_F(TestPrimitiveNode, Attrs) {
+ PrimitiveNode node1("foo", Repetition::REPEATED, Type::INT32);
+
+ PrimitiveNode node2("bar", Repetition::OPTIONAL, Type::BYTE_ARRAY,
+ LogicalType::UTF8);
+
+ ASSERT_EQ("foo", node1.name());
+
+ ASSERT_TRUE(node1.is_primitive());
+ ASSERT_FALSE(node1.is_group());
+
+ ASSERT_EQ(Repetition::REPEATED, node1.repetition());
+ ASSERT_EQ(Repetition::OPTIONAL, node2.repetition());
+
+ ASSERT_EQ(Node::PRIMITIVE, node1.node_type());
+
+ ASSERT_EQ(Type::INT32, node1.physical_type());
+ ASSERT_EQ(Type::BYTE_ARRAY, node2.physical_type());
+
+ // logical types
+ ASSERT_EQ(LogicalType::NONE, node1.logical_type());
+ ASSERT_EQ(LogicalType::UTF8, node2.logical_type());
+
+ // repetition
+ // One node per repetition kind to cover all three is_* predicates.
+ node1 = PrimitiveNode("foo", Repetition::REQUIRED, Type::INT32);
+ node2 = PrimitiveNode("foo", Repetition::OPTIONAL, Type::INT32);
+ PrimitiveNode node3("foo", Repetition::REPEATED, Type::INT32);
+
+ ASSERT_TRUE(node1.is_required());
+
+ ASSERT_TRUE(node2.is_optional());
+ ASSERT_FALSE(node2.is_required());
+
+ ASSERT_TRUE(node3.is_repeated());
+ ASSERT_FALSE(node3.is_optional());
+}
+
+// Converts Thrift SchemaElements through PrimitiveNode::FromParquet and
+// checks the resulting node attributes, including FLBA type lengths and
+// decimal metadata. Relies on the fixture initializing name_ and id_.
+TEST_F(TestPrimitiveNode, FromParquet) {
+ SchemaElement elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
+ parquet::Type::INT32);
+ Convert(&elt);
+ ASSERT_EQ(name_, prim_node_->name());
+ ASSERT_EQ(id_, prim_node_->id());
+ ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition());
+ ASSERT_EQ(Type::INT32, prim_node_->physical_type());
+ ASSERT_EQ(LogicalType::NONE, prim_node_->logical_type());
+
+ // Test a logical type
+ elt = NewPrimitive(name_, FieldRepetitionType::REQUIRED, parquet::Type::BYTE_ARRAY);
+ elt.__set_converted_type(ConvertedType::UTF8);
+
+ Convert(&elt);
+ ASSERT_EQ(Repetition::REQUIRED, prim_node_->repetition());
+ ASSERT_EQ(Type::BYTE_ARRAY, prim_node_->physical_type());
+ ASSERT_EQ(LogicalType::UTF8, prim_node_->logical_type());
+
+ // FIXED_LEN_BYTE_ARRAY
+ elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
+ parquet::Type::FIXED_LEN_BYTE_ARRAY);
+ elt.__set_type_length(16);
+
+ Convert(&elt);
+ ASSERT_EQ(name_, prim_node_->name());
+ ASSERT_EQ(id_, prim_node_->id());
+ ASSERT_EQ(Repetition::OPTIONAL, prim_node_->repetition());
+ ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
+ ASSERT_EQ(16, prim_node_->type_length());
+
+ // ConvertedType::Decimal
+ // scale/precision values here are arbitrary round-trip fixtures; they
+ // only need to survive conversion unchanged.
+ elt = NewPrimitive(name_, FieldRepetitionType::OPTIONAL,
+ parquet::Type::FIXED_LEN_BYTE_ARRAY);
+ elt.__set_converted_type(ConvertedType::DECIMAL);
+ elt.__set_type_length(6);
+ elt.__set_scale(12);
+ elt.__set_precision(2);
+
+ Convert(&elt);
+ ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
+ ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type());
+ ASSERT_EQ(6, prim_node_->type_length());
+ ASSERT_EQ(12, prim_node_->decimal_metadata().scale);
+ ASSERT_EQ(2, prim_node_->decimal_metadata().precision);
+}
+
+// Equality must consider name, repetition, and physical type — and, for
+// FIXED_LEN_BYTE_ARRAY, the type length as well.
+TEST_F(TestPrimitiveNode, Equals) {
+ PrimitiveNode node1("foo", Repetition::REQUIRED, Type::INT32);
+ PrimitiveNode node2("foo", Repetition::REQUIRED, Type::INT64);
+ PrimitiveNode node3("bar", Repetition::REQUIRED, Type::INT32);
+ PrimitiveNode node4("foo", Repetition::OPTIONAL, Type::INT32);
+ PrimitiveNode node5("foo", Repetition::REQUIRED, Type::INT32);
+
+ ASSERT_TRUE(node1.Equals(&node1));
+ ASSERT_FALSE(node1.Equals(&node2));
+ ASSERT_FALSE(node1.Equals(&node3));
+ ASSERT_FALSE(node1.Equals(&node4));
+ ASSERT_TRUE(node1.Equals(&node5));
+
+ // FLBA nodes differing only in type length must compare unequal.
+ PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+ flba1.SetTypeLength(12);
+
+ PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+ flba2.SetTypeLength(12);
+
+ PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+ flba3.SetTypeLength(16);
+
+ ASSERT_TRUE(flba1.Equals(&flba2));
+ ASSERT_FALSE(flba1.Equals(&flba3));
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+// Fixture providing a canonical field list for GroupNode tests.
+class TestGroupNode : public ::testing::Test {
+ public:
+  // Three-field fixture: required int32 "one", optional int64 "two",
+  // optional double "three".
+  NodeVector Fields1() {
+    return NodeVector({Int32("one", Repetition::REQUIRED),
+                       Int64("two"),
+                       Double("three")});
+  }
+};
+
+// Exercises the GroupNode attribute accessors: name, node type, field
+// count, repetition predicates, and logical type.
+TEST_F(TestGroupNode, Attrs) {
+ NodeVector fields = Fields1();
+
+ GroupNode node1("foo", Repetition::REPEATED, fields);
+ GroupNode node2("bar", Repetition::OPTIONAL, fields, LogicalType::LIST);
+
+ ASSERT_EQ("foo", node1.name());
+
+ ASSERT_TRUE(node1.is_group());
+ ASSERT_FALSE(node1.is_primitive());
+
+ ASSERT_EQ(fields.size(), node1.field_count());
+
+ ASSERT_TRUE(node1.is_repeated());
+ ASSERT_TRUE(node2.is_optional());
+
+ ASSERT_EQ(Repetition::REPEATED, node1.repetition());
+ ASSERT_EQ(Repetition::OPTIONAL, node2.repetition());
+
+ ASSERT_EQ(Node::GROUP, node1.node_type());
+
+ // logical types
+ ASSERT_EQ(LogicalType::NONE, node1.logical_type());
+ ASSERT_EQ(LogicalType::LIST, node2.logical_type());
+}
+
+// Group equality must compare name and each child field; differing names
+// or an extra field must compare unequal.
+TEST_F(TestGroupNode, Equals) {
+ NodeVector f1 = Fields1();
+ NodeVector f2 = Fields1();
+
+ GroupNode group1("group", Repetition::REPEATED, f1);
+ GroupNode group2("group", Repetition::REPEATED, f2);
+ GroupNode group3("group2", Repetition::REPEATED, f2);
+
+ // This is copied in the GroupNode ctor, so this is okay
+ f2.push_back(Float("four", Repetition::OPTIONAL));
+ GroupNode group4("group", Repetition::REPEATED, f2);
+
+ ASSERT_TRUE(group1.Equals(&group2));
+ ASSERT_FALSE(group1.Equals(&group3));
+
+ ASSERT_FALSE(group1.Equals(&group4));
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/test-util.h b/src/parquet/schema/test-util.h
new file mode 100644
index 0000000..5593abd
--- /dev/null
+++ b/src/parquet/schema/test-util.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Helper functions for constructing parquet::SchemaElement Thrift structs
+// in schema-related unit tests.
+
+#ifndef PARQUET_SCHEMA_TEST_UTIL_H
+#define PARQUET_SCHEMA_TEST_UTIL_H
+
+#include <string>
+
+#include "parquet/schema/types.h"
+#include "parquet/thrift/parquet_types.h"
+
+using parquet::ConvertedType;
+using parquet::FieldRepetitionType;
+using parquet::SchemaElement;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// Builds a Thrift SchemaElement describing a primitive (leaf) column.
+// num_children is explicitly set to 0 since leaves have no children.
+static inline SchemaElement NewPrimitive(const std::string& name,
+ FieldRepetitionType::type repetition, parquet::Type::type type) {
+ SchemaElement result;
+ result.__set_name(name);
+ result.__set_repetition_type(repetition);
+ result.__set_type(type);
+ result.__set_num_children(0);
+
+ return result;
+}
+
+// Builds a Thrift SchemaElement describing a group (nested) node with the
+// given child count. No `type` field is set: in Parquet metadata, a group
+// is distinguished from a primitive by the absence of a physical type.
+static inline SchemaElement NewGroup(const std::string& name,
+    FieldRepetitionType::type repetition, size_t num_children) {
+  SchemaElement result;
+  result.__set_name(name);
+  result.__set_repetition_type(repetition);
+  // The Thrift setter takes int32_t; cast explicitly instead of relying on
+  // an implicit narrowing conversion from size_t.
+  result.__set_num_children(static_cast<int32_t>(num_children));
+
+  return result;
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_SCHEMA_TEST_UTIL_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
new file mode 100644
index 0000000..e088eed
--- /dev/null
+++ b/src/parquet/schema/types.cc
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema/types.h"
+
+#include <memory>
+
+#include "parquet/thrift/parquet_types.h"
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Base node
+
+// Compares the attributes shared by all node types; subclasses layer
+// their own comparisons on top of this.
+bool Node::EqualsInternal(const Node* other) const {
+  if (type_ != other->type_) return false;
+  if (name_ != other->name_) return false;
+  if (repetition_ != other->repetition_) return false;
+  return logical_type_ == other->logical_type_;
+}
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+// Compares primitive-specific attributes. Assumes the caller has already
+// verified the shared attributes via Node::EqualsInternal (so the logical
+// types of the two nodes are known to match here).
+bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
+ if (physical_type_ != other->physical_type_) {
+ return false;
+ } else if (logical_type_ == LogicalType::DECIMAL) {
+ // TODO(wesm): metadata
+ ParquetException::NYI("comparing decimals");
+ // Unreachable when NYI throws; retained to satisfy the compiler's
+ // control-flow analysis.
+ return false;
+ } else if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ // FLBA equality additionally requires matching byte widths.
+ return type_length_ == other->type_length_;
+ }
+ return true;
+}
+
+// Deep equality: shared node attributes first, then the primitive-specific
+// ones. The static_cast is safe because EqualsInternal verified the node
+// types match.
+bool PrimitiveNode::Equals(const Node* other) const {
+  return Node::EqualsInternal(other) &&
+      EqualsInternal(static_cast<const PrimitiveNode*>(other));
+}
+
+// Double-dispatch hook for the visitor pattern (see Node::Visitor).
+void PrimitiveNode::Visit(Node::Visitor* visitor) {
+ visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+// Compares group-specific attributes: the two groups must have the same
+// number of fields and each pair of corresponding fields must be equal.
+bool GroupNode::EqualsInternal(const GroupNode* other) const {
+  if (this == other) {
+    return true;
+  }
+  const size_t count = this->field_count();
+  if (count != other->field_count()) {
+    return false;
+  }
+  bool fields_match = true;
+  for (size_t i = 0; i < count && fields_match; ++i) {
+    fields_match = this->field(i)->Equals(other->field(i).get());
+  }
+  return fields_match;
+}
+
+// Deep equality: shared node attributes first, then the field-by-field
+// comparison. The static_cast is safe because EqualsInternal verified the
+// node types match.
+bool GroupNode::Equals(const Node* other) const {
+  return Node::EqualsInternal(other) &&
+      EqualsInternal(static_cast<const GroupNode*>(other));
+}
+
+// Double-dispatch hook for the visitor pattern (see Node::Visitor).
+void GroupNode::Visit(Node::Visitor* visitor) {
+ visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Node construction from Parquet metadata
+
+// parquet::Type values map 1:1 onto Type::type (same ordinals), so a
+// plain cast is sufficient.
+static Type::type ConvertEnum(parquet::Type::type type) {
+ return static_cast<Type::type>(type);
+}
+
+// LogicalType prepends NONE at ordinal 0, so every value is shifted by +1
+// relative to parquet::ConvertedType; the cast must account for that.
+static LogicalType::type ConvertEnum(parquet::ConvertedType::type type) {
+ // item 0 is NONE
+ return static_cast<LogicalType::type>(static_cast<int>(type) + 1);
+}
+
+// parquet::FieldRepetitionType values map 1:1 onto Repetition::type.
+static Repetition::type ConvertEnum(parquet::FieldRepetitionType::type type) {
+ return static_cast<Repetition::type>(type);
+}
+
+// Bundle of the attributes shared by group and primitive node creation.
+// NOTE(review): `name` is a reference into the source SchemaElement, so a
+// NodeParams must not outlive the element it was built from — currently
+// both FromParquet callers use it immediately, but verify on new uses.
+struct NodeParams {
+ explicit NodeParams(const std::string& name) :
+ name(name) {}
+
+ const std::string& name;
+ Repetition::type repetition;
+ LogicalType::type logical_type;
+};
+
+// Extracts the name/repetition/logical-type triple from a Thrift element.
+// converted_type is optional in the Thrift schema, hence the __isset
+// check; an absent value maps to LogicalType::NONE.
+static inline NodeParams GetNodeParams(const parquet::SchemaElement* element) {
+ NodeParams params(element->name);
+
+ params.repetition = ConvertEnum(element->repetition_type);
+ if (element->__isset.converted_type) {
+ params.logical_type = ConvertEnum(element->converted_type);
+ } else {
+ params.logical_type = LogicalType::NONE;
+ }
+ return params;
+}
+
+// Reconstructs a GroupNode from a Thrift SchemaElement. The element is
+// passed as an opaque void* so parquet_types.h stays out of the public
+// headers; `fields` must already hold the converted child nodes.
+std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element, int node_id,
+ const NodeVector& fields) {
+ const parquet::SchemaElement* element =
+ static_cast<const parquet::SchemaElement*>(opaque_element);
+ NodeParams params = GetNodeParams(element);
+ return std::unique_ptr<Node>(new GroupNode(params.name, params.repetition, fields,
+ params.logical_type, node_id));
+}
+
+// Reconstructs a PrimitiveNode from a Thrift SchemaElement (passed as an
+// opaque void* to keep parquet_types.h out of the public headers). For
+// FIXED_LEN_BYTE_ARRAY the byte width is copied over, and for the DECIMAL
+// logical type the scale/precision metadata as well.
+std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
+    int node_id) {
+  const parquet::SchemaElement* element =
+    static_cast<const parquet::SchemaElement*>(opaque_element);
+  NodeParams params = GetNodeParams(element);
+
+  std::unique_ptr<PrimitiveNode> result = std::unique_ptr<PrimitiveNode>(
+      new PrimitiveNode(params.name, params.repetition,
+          ConvertEnum(element->type), params.logical_type, node_id));
+
+  if (element->type == parquet::Type::FIXED_LEN_BYTE_ARRAY) {
+    result->SetTypeLength(element->type_length);
+    if (params.logical_type == LogicalType::DECIMAL) {
+      result->SetDecimalMetadata(element->scale, element->precision);
+    }
+  }
+
+  // unique_ptr<Derived> converts to unique_ptr<Base> on move; no manual
+  // release()/re-wrap needed.
+  return std::move(result);
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h
new file mode 100644
index 0000000..82db233
--- /dev/null
+++ b/src/parquet/schema/types.h
@@ -0,0 +1,303 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#ifndef PARQUET_SCHEMA_TYPES_H
+#define PARQUET_SCHEMA_TYPES_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/exception.h"
+#include "parquet/types.h"
+#include "parquet/util/macros.h"
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+// repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+// <required/optional> group list
+// repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+// <required/optional> group bag
+// repeated group list
+// <required/optional> value_type item
+//
+// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
+// the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+struct ListEncoding {
+ enum type {
+ ONE_LEVEL,
+ TWO_LEVEL,
+ THREE_LEVEL
+ };
+};
+
+// Parameters of the DECIMAL logical type: scale is the number of digits
+// after the decimal point, precision the total number of digits.
+struct DecimalMetadata {
+ int32_t scale;
+ int32_t precision;
+};
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class Node {
+ public:
+ enum type {
+ PRIMITIVE,
+ GROUP
+ };
+
+ // id is an optional schema-wide ordinal; -1 means "not assigned" (e.g.
+ // for nodes built directly rather than from file metadata).
+ Node(Node::type type, const std::string& name,
+ Repetition::type repetition,
+ LogicalType::type logical_type = LogicalType::NONE,
+ int id = -1) :
+ type_(type),
+ name_(name),
+ repetition_(repetition),
+ logical_type_(logical_type),
+ id_(id) {}
+
+ virtual ~Node() {}
+
+ bool is_primitive() const {
+ return type_ == Node::PRIMITIVE;
+ }
+
+ bool is_group() const {
+ return type_ == Node::GROUP;
+ }
+
+ bool is_optional() const {
+ return repetition_ == Repetition::OPTIONAL;
+ }
+
+ bool is_repeated() const {
+ return repetition_ == Repetition::REPEATED;
+ }
+
+ bool is_required() const {
+ return repetition_ == Repetition::REQUIRED;
+ }
+
+ // Deep equality, implemented by each subclass; compares shared
+ // attributes plus the subclass-specific ones.
+ virtual bool Equals(const Node* other) const = 0;
+
+ const std::string& name() const {
+ return name_;
+ }
+
+ Node::type node_type() const {
+ return type_;
+ }
+
+ Repetition::type repetition() const {
+ return repetition_;
+ }
+
+ LogicalType::type logical_type() const {
+ return logical_type_;
+ }
+
+ int id() const {
+ return id_;
+ }
+
+ // Node::Visitor abstract class for walking schemas with the visitor pattern
+ class Visitor {
+ public:
+ virtual ~Visitor() {}
+
+ virtual void Visit(const Node* node) = 0;
+ };
+
+ virtual void Visit(Visitor* visitor) = 0;
+
+ protected:
+ Node::type type_;
+ std::string name_;
+ Repetition::type repetition_;
+ LogicalType::type logical_type_;
+ int id_;
+
+ // Compares the attributes common to all node types; subclasses call this
+ // before comparing their own attributes.
+ bool EqualsInternal(const Node* other) const;
+};
+
+// Save our breath all over the place with these typedefs
+typedef std::shared_ptr<Node> NodePtr;
+typedef std::vector<NodePtr> NodeVector;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), also has the
+// physical storage type and their type-specific metadata (byte width, decimal
+// parameters)
+class PrimitiveNode : public Node {
+ public:
+  // FromParquet accepts an opaque void* to avoid exporting
+  // parquet::SchemaElement into the public API
+  static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id);
+
+  // Factory for a shared primitive node.
+  static inline NodePtr Make(const std::string& name,
+      Repetition::type repetition, Type::type type,
+      LogicalType::type logical_type = LogicalType::NONE) {
+    return NodePtr(new PrimitiveNode(name, repetition, type, logical_type));
+  }
+
+  // Alternate constructor for FIXED_LEN_BYTE_ARRAY (FLBA)
+  static inline NodePtr MakeFLBA(const std::string& name,
+      Repetition::type repetition, Type::type type,
+      int32_t type_length,
+      LogicalType::type logical_type = LogicalType::NONE) {
+    NodePtr result = Make(name, repetition, type, logical_type);
+    static_cast<PrimitiveNode*>(result.get())->SetTypeLength(type_length);
+    return result;
+  }
+
+  virtual bool Equals(const Node* other) const;
+
+  Type::type physical_type() const {
+    return physical_type_;
+  }
+
+  // Byte width; only meaningful for FIXED_LEN_BYTE_ARRAY (-1 otherwise).
+  int32_t type_length() const {
+    return type_length_;
+  }
+
+  // Scale/precision; only meaningful for the DECIMAL logical type.
+  const DecimalMetadata& decimal_metadata() const {
+    return decimal_metadata_;
+  }
+
+  virtual void Visit(Visitor* visitor);
+
+ private:
+  PrimitiveNode(const std::string& name, Repetition::type repetition,
+      Type::type type,
+      LogicalType::type logical_type = LogicalType::NONE,
+      int id = -1) :
+      Node(Node::PRIMITIVE, name, repetition, logical_type, id),
+      physical_type_(type),
+      type_length_(-1) {
+    // Initialize type-specific metadata so the accessors (and Equals on
+    // FLBA nodes) never read indeterminate values; previously these
+    // members were left uninitialized for non-FLBA/non-decimal nodes.
+    decimal_metadata_.scale = 0;
+    decimal_metadata_.precision = 0;
+  }
+
+  Type::type physical_type_;
+  int32_t type_length_;
+  DecimalMetadata decimal_metadata_;
+
+  // For FIXED_LEN_BYTE_ARRAY
+  void SetTypeLength(int32_t length) {
+    type_length_ = length;
+  }
+
+  // For Decimal logical type: Precision and scale
+  void SetDecimalMetadata(int32_t scale, int32_t precision) {
+    decimal_metadata_.scale = scale;
+    decimal_metadata_.precision = precision;
+  }
+
+  bool EqualsInternal(const PrimitiveNode* other) const;
+
+  FRIEND_TEST(TestPrimitiveNode, Attrs);
+  FRIEND_TEST(TestPrimitiveNode, Equals);
+  FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
+
+// A nested (non-leaf) schema node holding an ordered list of child nodes.
+class GroupNode : public Node {
+ public:
+ // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting
+ // parquet::SchemaElement into the public API
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id,
+ const NodeVector& fields);
+
+ // Factory for a shared group node; `fields` is copied into the node.
+ static inline NodePtr Make(const std::string& name,
+ Repetition::type repetition, const NodeVector& fields,
+ LogicalType::type logical_type = LogicalType::NONE) {
+ return NodePtr(new GroupNode(name, repetition, fields, logical_type));
+ }
+
+ virtual bool Equals(const Node* other) const;
+
+ // Unchecked access: i must be < field_count().
+ const NodePtr& field(size_t i) const {
+ return fields_[i];
+ }
+
+ size_t field_count() const {
+ return fields_.size();
+ }
+
+ virtual void Visit(Visitor* visitor);
+
+ private:
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ LogicalType::type logical_type = LogicalType::NONE,
+ int id = -1) :
+ Node(Node::GROUP, name, repetition, logical_type, id),
+ fields_(fields) {}
+
+ NodeVector fields_;
+ bool EqualsInternal(const GroupNode* other) const;
+
+ FRIEND_TEST(TestGroupNode, Attrs);
+ FRIEND_TEST(TestGroupNode, Equals);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+// Generates a convenience factory (e.g. Int32("col")) producing a shared
+// primitive node of the given physical type; repetition defaults to
+// OPTIONAL.
+#define PRIMITIVE_FACTORY(FuncName, TYPE) \
+ static inline NodePtr FuncName(const std::string& name, \
+ Repetition::type repetition = Repetition::OPTIONAL) { \
+ return PrimitiveNode::Make(name, repetition, Type::TYPE); \
+ }
+
+PRIMITIVE_FACTORY(Boolean, BOOLEAN);
+PRIMITIVE_FACTORY(Int32, INT32);
+PRIMITIVE_FACTORY(Int64, INT64);
+PRIMITIVE_FACTORY(Int96, INT96);
+PRIMITIVE_FACTORY(Float, FLOAT);
+PRIMITIVE_FACTORY(Double, DOUBLE);
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY);
+
+} // namespace schema
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_SCHEMA_TYPES_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index f39e3a2..2d15cad 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -24,11 +24,110 @@
#include <sstream>
#include <string>
-#include "parquet/thrift/parquet_types.h"
#include "parquet/util/compiler-util.h"
namespace parquet_cpp {
+// ----------------------------------------------------------------------
+// Metadata enums to match Thrift metadata
+//
+// The reason we maintain our own enums is to avoid transitive dependency on
+// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
+// public API. After building parquet-cpp, you should not need to include
+// Thrift headers in your application. This means some boilerplate to convert
+// between our types and Parquet's Thrift types.
+//
+// We can also add special values like NONE to distinguish between metadata
+// values being set and not set. As an example consider ConvertedType and
+// CompressionCodec
+
+// Mirrors parquet::Type
+struct Type {
+ enum type {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7
+ };
+};
+
+// Mirrors parquet::ConvertedType
+struct LogicalType {
+ // NONE is prepended at ordinal 0, so every value below is offset +1
+ // relative to parquet::ConvertedType; conversions must account for this
+ // (see ConvertEnum in schema/types.cc).
+ enum type {
+ NONE,
+ UTF8,
+ MAP,
+ MAP_KEY_VALUE,
+ LIST,
+ ENUM,
+ DECIMAL,
+ DATE,
+ TIME_MILLIS,
+ TIMESTAMP_MILLIS,
+ UINT_8,
+ UINT_16,
+ UINT_32,
+ UINT_64,
+ INT_8,
+ INT_16,
+ INT_32,
+ INT_64,
+ JSON,
+ BSON,
+ INTERVAL
+ };
+};
+
+// Mirrors parquet::FieldRepetitionType
+struct Repetition {
+ enum type {
+ REQUIRED = 0,
+ OPTIONAL = 1,
+ REPEATED = 2
+ };
+};
+
+// Data encodings. Mirrors parquet::Encoding
+struct Encoding {
+ enum type {
+ PLAIN = 0,
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8
+ };
+};
+
+// Compression, mirrors parquet::CompressionCodec
+struct Compression {
+ // NOTE(review): NONE is prepended here, which appears to shift the
+ // values by one relative to parquet::CompressionCodec (where
+ // UNCOMPRESSED = 0) — any Thrift conversion must account for this;
+ // confirm against parquet_types.h.
+ enum type {
+ NONE,
+ UNCOMPRESSED,
+ SNAPPY,
+ GZIP,
+ LZO
+ };
+};
+
+// parquet::PageType
+struct PageType {
+ enum type {
+ DATA_PAGE,
+ INDEX_PAGE,
+ DICTIONARY_PAGE,
+ DATA_PAGE_V2
+ };
+};
+
+// ----------------------------------------------------------------------
+
struct ByteArray {
uint32_t len;
const uint8_t* ptr;
@@ -80,72 +179,64 @@ struct type_traits {
};
template <>
-struct type_traits<parquet::Type::BOOLEAN> {
+struct type_traits<Type::BOOLEAN> {
typedef bool value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::BOOLEAN;
static constexpr size_t value_byte_size = 1;
static constexpr const char* printf_code = "d";
};
template <>
-struct type_traits<parquet::Type::INT32> {
+struct type_traits<Type::INT32> {
typedef int32_t value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::INT32;
static constexpr size_t value_byte_size = 4;
static constexpr const char* printf_code = "d";
};
template <>
-struct type_traits<parquet::Type::INT64> {
+struct type_traits<Type::INT64> {
typedef int64_t value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::INT64;
static constexpr size_t value_byte_size = 8;
static constexpr const char* printf_code = "ld";
};
template <>
-struct type_traits<parquet::Type::INT96> {
+struct type_traits<Type::INT96> {
typedef Int96 value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::INT96;
static constexpr size_t value_byte_size = 12;
static constexpr const char* printf_code = "s";
};
template <>
-struct type_traits<parquet::Type::FLOAT> {
+struct type_traits<Type::FLOAT> {
typedef float value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::FLOAT;
static constexpr size_t value_byte_size = 4;
static constexpr const char* printf_code = "f";
};
template <>
-struct type_traits<parquet::Type::DOUBLE> {
+struct type_traits<Type::DOUBLE> {
typedef double value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::DOUBLE;
static constexpr size_t value_byte_size = 8;
static constexpr const char* printf_code = "lf";
};
template <>
-struct type_traits<parquet::Type::BYTE_ARRAY> {
+struct type_traits<Type::BYTE_ARRAY> {
typedef ByteArray value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::BYTE_ARRAY;
static constexpr size_t value_byte_size = sizeof(ByteArray);
static constexpr const char* printf_code = "s";
};
template <>
-struct type_traits<parquet::Type::FIXED_LEN_BYTE_ARRAY> {
+struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
typedef FixedLenByteArray value_type;
- static constexpr parquet::Type::type parquet_type = parquet::Type::FIXED_LEN_BYTE_ARRAY;
static constexpr size_t value_byte_size = sizeof(FixedLenByteArray);
static constexpr const char* printf_code = "s";
@@ -158,6 +249,38 @@ inline std::string format_fwf(int width) {
return ss.str();
}
+static inline std::string type_to_string(Type::type t) {
+ switch (t) {
+ case Type::BOOLEAN:
+ return "BOOLEAN";
+ break;
+ case Type::INT32:
+ return "INT32";
+ break;
+ case Type::INT64:
+ return "INT64";
+ break;
+ case Type::INT96:
+ return "INT96";
+ break;
+ case Type::FLOAT:
+ return "FLOAT";
+ break;
+ case Type::DOUBLE:
+ return "DOUBLE";
+ break;
+ case Type::BYTE_ARRAY:
+ return "BYTE_ARRAY";
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return "FIXED_LEN_BYTE_ARRAY";
+ break;
+ default:
+ return "UNKNOWN";
+ break;
+ }
+}
+
} // namespace parquet_cpp
#endif // PARQUET_TYPES_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt
index 1c86112..90a053f 100644
--- a/src/parquet/util/CMakeLists.txt
+++ b/src/parquet/util/CMakeLists.txt
@@ -24,6 +24,7 @@ install(FILES
sse-info.h
compiler-util.h
logging.h
+ macros.h
rle-encoding.h
stopwatch.h
input_stream.h
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/util/macros.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h
new file mode 100644
index 0000000..7b301d6
--- /dev/null
+++ b/src/parquet/util/macros.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_UTIL_MACROS_H
+#define PARQUET_UTIL_MACROS_H
+
+// Useful macros from elsewhere
+
+// ----------------------------------------------------------------------
+// From googletest
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class. For example:
+//
+// class MyClass {
+// private:
+// void MyMethod();
+// FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+// // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+// // Can call MyClass::MyMethod() here.
+// }
+
+// Expands to a friend declaration for the class googletest generates from
+// TEST/TEST_F(test_case_name, test_name), i.e. test_case_name_test_name_Test.
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
+
+#endif // PARQUET_UTIL_MACROS_H
[2/2] parquet-cpp git commit: PARQUET-442: Nested schema conversion,
Thrift struct decoupling, dump-schema utility
Posted by ju...@apache.org.
PARQUET-442: Nested schema conversion, Thrift struct decoupling, dump-schema utility
Several inter-related things here:
* Added SchemaDescriptor and ColumnDescriptor types to hold computed structure
information (e.g. max rep/def levels) about the file schema. These are used
now in the FileReader and ColumnReader
* I also added, very similar to parquet-mr (though slimmed down), a logical
schema node class structure which can be used for both the file reading and
writing.
* Added FlatSchemaConverter to convert Parquet flat schema metadata into a
nested logical schema
* Added a SchemaPrinter tool and parquet-dump-schema CLI tool to visit a nested
schema and print it to the console.
* Another big thing here is that per PARQUET-446 and related work in
parquet-mr, it's important for both the public API of this project and
internal development to limit our coupling to the compiled Thrift headers. I
added `Type`, `Repetition`, and `LogicalType` enums to the `parquet_cpp`
namespace and inverted the dependency between the column readers, scanners,
and encoders to use these enums.
* A bunch of unit tests.
Author: Wes McKinney <we...@cloudera.com>
Closes #38 from wesm/PARQUET-442 and squashes the following commits:
9ca0219 [Wes McKinney] Add a unit test for SchemaPrinter
fdd37cd [Wes McKinney] Comment re: FLBA node ctor
3a15c0c [Wes McKinney] Add some SchemaDescriptor and ColumnDescriptor tests
27e1805 [Wes McKinney] Don't squash supplied CMAKE_CXX_FLAGS
76dd283 [Wes McKinney] Refactor Make* methods as static member functions
2fae8cd [Wes McKinney] Trim some includes
b2e2661 [Wes McKinney] More doc about the parquet_cpp enums
bd78d7c [Wes McKinney] Move metadata enums to parquet/types.h and add rest of parquet:: enums. Add NONE value to Compression
415305b [Wes McKinney] cpplint
4ac84aa [Wes McKinney] Refactor to make PrimitiveNode and GroupNode ctors private. Add MakePrimitive and MakeGroup factory functions. Move parquet::SchemaElement function into static FromParquet ctors so can set private members
3169b24 [Wes McKinney] NewPrimitive should set num_children = 0 always
954658e [Wes McKinney] Add a comment for TestSchemaConverter.InvalidRoot and uncomment tests for root nodes of other repetition types
55d21b0 [Wes McKinney] Remove schema-builder-test.cc
71c1eab [Wes McKinney] Remove crufty builder.h, will revisit
7ef2dee [Wes McKinney] Fix list encoding comment
8c5af4e [Wes McKinney] Remove old comment, unneeded cast
6b041c5 [Wes McKinney] First draft SchemaDescriptor::Init. Refactor to use ColumnDescriptor. Standardize on parquet_cpp enums instead of Thrift metadata structs. Limit #include from Thrift
841ae7f [Wes McKinney] Don't export SchemaPrinter for now
834389a [Wes McKinney] Add Node::Visitor API and implement a simple schema dump CLI tool
a8bf5c8 [Wes McKinney] Catch and throw exception (instead of core dump) if run out of schema children. Add a Node::Visitor abstract API
bde8b18 [Wes McKinney] Can compare FLBA type metadata in logical schemas
f0df0ba [Wes McKinney] Finish a nested schema conversion test
0af0161 [Wes McKinney] Check that root schema node is repeated
5df00aa [Wes McKinney] Expose GroupConverter API, add test for invalid root
beaa99f [Wes McKinney] Refactor slightly and add an FLBA test
6e248b8 [Wes McKinney] Schema tree conversion first cut, add a couple primitive tests
9685c90 [Wes McKinney] Rename Schema -> RootSchema and add another unit test
f7d0487 [Wes McKinney] Schema types test coverage, move more methods into compilation unit
d746352 [Wes McKinney] Better isolate thrift dependency. Move schema/column descriptor into its own header
a8e5a0a [Wes McKinney] Tweaks
fb9d7ad [Wes McKinney] Draft of flat to nested schema conversion. No tests yet
3015063 [Wes McKinney] More prototyping. Rename Type -> Node. PrimitiveNode factory functions
a8a7a01 [Wes McKinney] Start drafting schema types
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/04d75c7c
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/04d75c7c
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/04d75c7c
Branch: refs/heads/master
Commit: 04d75c7cb9203263c31eb6be4fade076ac536d63
Parents: c0eec9a
Author: Wes McKinney <we...@cloudera.com>
Authored: Fri Feb 5 18:01:43 2016 -0800
Committer: Julien Le Dem <ju...@dremio.com>
Committed: Fri Feb 5 18:01:43 2016 -0800
----------------------------------------------------------------------
CMakeLists.txt | 8 +-
example/CMakeLists.txt | 3 +
example/decode_benchmark.cc | 1 -
example/parquet-dump-schema.cc | 59 ++++
src/parquet/column/column-reader-test.cc | 33 +-
src/parquet/column/reader.cc | 77 ++---
src/parquet/column/reader.h | 49 ++-
src/parquet/column/scanner.cc | 4 -
src/parquet/column/scanner.h | 40 +--
src/parquet/column/test-util.h | 6 +-
src/parquet/encodings/delta-bit-pack-encoding.h | 8 +-
.../encodings/delta-byte-array-encoding.h | 10 +-
.../delta-length-byte-array-encoding.h | 12 +-
src/parquet/encodings/dictionary-encoding.h | 25 +-
src/parquet/encodings/encodings.h | 18 +-
src/parquet/encodings/plain-encoding.h | 20 +-
src/parquet/reader.cc | 88 ++----
src/parquet/reader.h | 13 +
src/parquet/schema/CMakeLists.txt | 29 ++
src/parquet/schema/converter.cc | 106 +++++++
src/parquet/schema/converter.h | 92 ++++++
src/parquet/schema/descriptor.cc | 94 ++++++
src/parquet/schema/descriptor.h | 127 ++++++++
src/parquet/schema/printer.cc | 139 +++++++++
src/parquet/schema/printer.h | 38 +++
src/parquet/schema/schema-converter-test.cc | 136 +++++++++
src/parquet/schema/schema-descriptor-test.cc | 128 ++++++++
src/parquet/schema/schema-printer-test.cc | 72 +++++
src/parquet/schema/schema-types-test.cc | 231 ++++++++++++++
src/parquet/schema/test-util.h | 63 ++++
src/parquet/schema/types.cc | 163 ++++++++++
src/parquet/schema/types.h | 303 +++++++++++++++++++
src/parquet/types.h | 157 ++++++++--
src/parquet/util/CMakeLists.txt | 1 +
src/parquet/util/macros.h | 47 +++
35 files changed, 2162 insertions(+), 238 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d379e3d..e856494 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -177,7 +177,7 @@ set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
# where to put generated binaries
set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}")
-SET(CMAKE_CXX_FLAGS "-std=c++11 -msse3 -Wall -Wno-unused-value -Wno-unused-variable -Wno-sign-compare -Wno-unknown-pragmas")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -msse3 -Wall -Wno-unused-value -Wno-unused-variable -Wno-sign-compare -Wno-unknown-pragmas")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -ggdb")
if (APPLE)
@@ -218,6 +218,11 @@ set(LIBPARQUET_SRCS
src/parquet/column/reader.cc
src/parquet/column/scanner.cc
src/parquet/reader.cc
+
+ src/parquet/schema/converter.cc
+ src/parquet/schema/descriptor.cc
+ src/parquet/schema/printer.cc
+ src/parquet/schema/types.cc
)
set(LIBPARQUET_LINK_LIBS
@@ -250,6 +255,7 @@ add_subdirectory(src/parquet)
add_subdirectory(src/parquet/column)
add_subdirectory(src/parquet/compression)
add_subdirectory(src/parquet/encodings)
+add_subdirectory(src/parquet/schema)
add_subdirectory(src/parquet/thrift)
add_subdirectory(src/parquet/util)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/example/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index bd9e66c..4143b0d 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -25,3 +25,6 @@ target_link_libraries(decode_benchmark ${LINK_LIBS})
add_executable(parquet_reader parquet_reader.cc)
target_link_libraries(parquet_reader ${LINK_LIBS})
+
+add_executable(parquet-dump-schema parquet-dump-schema.cc)
+target_link_libraries(parquet-dump-schema ${LINK_LIBS})
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/example/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/example/decode_benchmark.cc b/example/decode_benchmark.cc
index 9a7a19f..b9076bf 100644
--- a/example/decode_benchmark.cc
+++ b/example/decode_benchmark.cc
@@ -23,7 +23,6 @@
#include "parquet/encodings/encodings.h"
#include "parquet/util/stopwatch.h"
-using namespace parquet;
using namespace parquet_cpp;
using namespace std;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/example/parquet-dump-schema.cc
----------------------------------------------------------------------
diff --git a/example/parquet-dump-schema.cc b/example/parquet-dump-schema.cc
new file mode 100644
index 0000000..9471225
--- /dev/null
+++ b/example/parquet-dump-schema.cc
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <parquet/reader.h>
+#include <parquet/schema/converter.h>
+#include <parquet/schema/printer.h>
+
+#include <iostream>
+
+using namespace parquet_cpp;
+
+// Converts the reader's flat Thrift schema elements into a nested logical
+// schema tree and prints it to stdout.
+void DumpSchema(const ParquetFileReader* reader) {
+  auto schema = reader->metadata().schema;
+  // The converter walks the flat element list starting at the root (&schema[0]).
+  schema::FlatSchemaConverter converter(&schema[0], schema.size());
+  std::unique_ptr<schema::Node> root = converter.Convert();
+
+  PrintSchema(root.get(), std::cout);
+}
+
+// CLI entry point: dump the schema of the Parquet file named by argv[1].
+// Returns 0 on success, -1 on usage error, file-open failure, or parse error.
+int main(int argc, char** argv) {
+  // Guard against missing argument: reading argv[1] with argc < 2 is
+  // undefined behavior (out-of-bounds access on the argv array).
+  if (argc < 2) {
+    std::cerr << "Usage: " << argv[0] << " <parquet-file>" << std::endl;
+    return -1;
+  }
+  std::string filename = argv[1];
+
+  parquet_cpp::ParquetFileReader reader;
+  parquet_cpp::LocalFile file;
+
+  file.Open(filename);
+  if (!file.is_open()) {
+    std::cerr << "Could not open file " << file.path()
+              << std::endl;
+    return -1;
+  }
+
+  try {
+    reader.Open(&file);
+    reader.ParseMetaData();
+    DumpSchema(&reader);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: "
+              << e.what()
+              << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/column/column-reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-reader-test.cc b/src/parquet/column/column-reader-test.cc
index 88f4465..920ae56 100644
--- a/src/parquet/column/column-reader-test.cc
+++ b/src/parquet/column/column-reader-test.cc
@@ -36,11 +36,11 @@ using std::vector;
using std::shared_ptr;
using parquet::FieldRepetitionType;
using parquet::SchemaElement;
-using parquet::Encoding;
-using parquet::Type;
namespace parquet_cpp {
+using schema::NodePtr;
+
namespace test {
class TestPrimitiveReader : public ::testing::Test {
@@ -49,9 +49,9 @@ class TestPrimitiveReader : public ::testing::Test {
void TearDown() {}
- void InitReader(const SchemaElement* element) {
+ void InitReader(const ColumnDescriptor* descr) {
pager_.reset(new test::MockPageReader(pages_));
- reader_ = ColumnReader::Make(element, std::move(pager_));
+ reader_ = ColumnReader::Make(descr, std::move(pager_));
}
protected:
@@ -77,18 +77,17 @@ static vector<T> slice(const vector<T>& values, size_t start, size_t end) {
TEST_F(TestPrimitiveReader, TestInt32FlatRequired) {
vector<int32_t> values = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
size_t num_values = values.size();
- Encoding::type value_encoding = Encoding::PLAIN;
+ parquet::Encoding::type value_encoding = parquet::Encoding::PLAIN;
vector<uint8_t> page1;
test::DataPageBuilder<Type::INT32> page_builder(&page1);
- page_builder.AppendValues(values, Encoding::PLAIN);
+ page_builder.AppendValues(values, parquet::Encoding::PLAIN);
pages_.push_back(page_builder.Finish());
// TODO: simplify this
- SchemaElement element;
- element.__set_type(Type::INT32);
- element.__set_repetition_type(FieldRepetitionType::REQUIRED);
- InitReader(&element);
+ NodePtr type = schema::Int32("a", Repetition::REQUIRED);
+ ColumnDescriptor descr(type, 0, 0);
+ InitReader(&descr);
Int32Reader* reader = static_cast<Int32Reader*>(reader_.get());
@@ -108,22 +107,20 @@ TEST_F(TestPrimitiveReader, TestInt32FlatOptional) {
vector<int16_t> def_levels = {1, 0, 0, 1, 1, 0, 0, 0, 1, 1};
size_t num_values = values.size();
- Encoding::type value_encoding = Encoding::PLAIN;
+ parquet::Encoding::type value_encoding = parquet::Encoding::PLAIN;
vector<uint8_t> page1;
test::DataPageBuilder<Type::INT32> page_builder(&page1);
// Definition levels precede the values
- page_builder.AppendDefLevels(def_levels, 1, Encoding::RLE);
- page_builder.AppendValues(values, Encoding::PLAIN);
+ page_builder.AppendDefLevels(def_levels, 1, parquet::Encoding::RLE);
+ page_builder.AppendValues(values, parquet::Encoding::PLAIN);
pages_.push_back(page_builder.Finish());
- // TODO: simplify this
- SchemaElement element;
- element.__set_type(Type::INT32);
- element.__set_repetition_type(FieldRepetitionType::OPTIONAL);
- InitReader(&element);
+ NodePtr type = schema::Int32("a", Repetition::OPTIONAL);
+ ColumnDescriptor descr(type, 1, 0);
+ InitReader(&descr);
Int32Reader* reader = static_cast<Int32Reader*>(reader_.get());
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/column/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/reader.cc b/src/parquet/column/reader.cc
index 91e026a..0fe7a6e 100644
--- a/src/parquet/column/reader.cc
+++ b/src/parquet/column/reader.cc
@@ -28,26 +28,23 @@
namespace parquet_cpp {
-using parquet::Encoding;
-using parquet::FieldRepetitionType;
-using parquet::PageType;
-using parquet::Type;
-
-ColumnReader::ColumnReader(const parquet::SchemaElement* schema,
+ColumnReader::ColumnReader(const ColumnDescriptor* descr,
std::unique_ptr<PageReader> pager)
- : schema_(schema),
+ : descr_(descr),
pager_(std::move(pager)),
num_buffered_values_(0),
num_decoded_values_(0) {}
template <int TYPE>
void TypedColumnReader<TYPE>::ConfigureDictionary(const DictionaryPage* page) {
- auto it = decoders_.find(Encoding::RLE_DICTIONARY);
+ int encoding = static_cast<int>(parquet::Encoding::RLE_DICTIONARY);
+
+ auto it = decoders_.find(encoding);
if (it != decoders_.end()) {
throw ParquetException("Column cannot have more than one dictionary.");
}
- PlainDecoder<TYPE> dictionary(schema_);
+ PlainDecoder<TYPE> dictionary(descr_);
dictionary.SetData(page->num_values(), page->data(), page->size());
// The dictionary is fully decoded during DictionaryDecoder::Init, so the
@@ -56,10 +53,10 @@ void TypedColumnReader<TYPE>::ConfigureDictionary(const DictionaryPage* page) {
// TODO(wesm): investigate whether this all-or-nothing decoding of the
// dictionary makes sense and whether performance can be improved
std::shared_ptr<DecoderType> decoder(
- new DictionaryDecoder<TYPE>(schema_, &dictionary));
+ new DictionaryDecoder<TYPE>(descr_, &dictionary));
- decoders_[Encoding::RLE_DICTIONARY] = decoder;
- current_decoder_ = decoders_[Encoding::RLE_DICTIONARY].get();
+ decoders_[encoding] = decoder;
+ current_decoder_ = decoders_[encoding].get();
}
@@ -76,8 +73,9 @@ static size_t InitializeLevelDecoder(const uint8_t* buffer,
// PLAIN_DICTIONARY is deprecated but used to be used as a dictionary index
// encoding.
-static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
- return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
+static bool IsDictionaryIndexEncoding(const parquet::Encoding::type& e) {
+ return e == parquet::Encoding::RLE_DICTIONARY ||
+ e == parquet::Encoding::PLAIN_DICTIONARY;
}
template <int TYPE>
@@ -92,10 +90,10 @@ bool TypedColumnReader<TYPE>::ReadNewPage() {
return false;
}
- if (current_page_->type() == PageType::DICTIONARY_PAGE) {
+ if (current_page_->type() == parquet::PageType::DICTIONARY_PAGE) {
ConfigureDictionary(static_cast<const DictionaryPage*>(current_page_.get()));
continue;
- } else if (current_page_->type() == PageType::DATA_PAGE) {
+ } else if (current_page_->type() == parquet::PageType::DATA_PAGE) {
const DataPage* page = static_cast<const DataPage*>(current_page_.get());
// Read a data page.
@@ -111,10 +109,11 @@ bool TypedColumnReader<TYPE>::ReadNewPage() {
// the page size to determine the number of bytes in the encoded data.
size_t data_size = page->size();
+ max_definition_level_ = descr_->max_definition_level();
+
// Read definition levels.
- if (schema_->repetition_type != FieldRepetitionType::REQUIRED) {
+ if (max_definition_level_ > 0) {
// Temporary hack until schema resolution implemented
- max_definition_level_ = 1;
size_t def_levels_bytes = InitializeLevelDecoder(buffer,
max_definition_level_, definition_level_decoder_);
@@ -130,27 +129,29 @@ bool TypedColumnReader<TYPE>::ReadNewPage() {
// Get a decoder object for this page or create a new decoder if this is the
// first page with this encoding.
- Encoding::type encoding = page->encoding();
+ parquet::Encoding::type encoding = page->encoding();
- if (IsDictionaryIndexEncoding(encoding)) encoding = Encoding::RLE_DICTIONARY;
+ if (IsDictionaryIndexEncoding(encoding)) {
+ encoding = parquet::Encoding::RLE_DICTIONARY;
+ }
- auto it = decoders_.find(encoding);
+ auto it = decoders_.find(static_cast<int>(encoding));
if (it != decoders_.end()) {
current_decoder_ = it->second.get();
} else {
switch (encoding) {
- case Encoding::PLAIN: {
- std::shared_ptr<DecoderType> decoder(new PlainDecoder<TYPE>(schema_));
- decoders_[encoding] = decoder;
+ case parquet::Encoding::PLAIN: {
+ std::shared_ptr<DecoderType> decoder(new PlainDecoder<TYPE>(descr_));
+ decoders_[static_cast<int>(encoding)] = decoder;
current_decoder_ = decoder.get();
break;
}
- case Encoding::RLE_DICTIONARY:
+ case parquet::Encoding::RLE_DICTIONARY:
throw ParquetException("Dictionary page must be before data page.");
- case Encoding::DELTA_BINARY_PACKED:
- case Encoding::DELTA_LENGTH_BYTE_ARRAY:
- case Encoding::DELTA_BYTE_ARRAY:
+ case parquet::Encoding::DELTA_BINARY_PACKED:
+ case parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY:
+ case parquet::Encoding::DELTA_BYTE_ARRAY:
ParquetException::NYI("Unsupported encoding");
default:
@@ -202,25 +203,25 @@ size_t ColumnReader::ReadRepetitionLevels(size_t batch_size, int16_t* levels) {
// Dynamic column reader constructor
std::shared_ptr<ColumnReader> ColumnReader::Make(
- const parquet::SchemaElement* element,
+ const ColumnDescriptor* descr,
std::unique_ptr<PageReader> pager) {
- switch (element->type) {
+ switch (descr->physical_type()) {
case Type::BOOLEAN:
- return std::make_shared<BoolReader>(element, std::move(pager));
+ return std::make_shared<BoolReader>(descr, std::move(pager));
case Type::INT32:
- return std::make_shared<Int32Reader>(element, std::move(pager));
+ return std::make_shared<Int32Reader>(descr, std::move(pager));
case Type::INT64:
- return std::make_shared<Int64Reader>(element, std::move(pager));
+ return std::make_shared<Int64Reader>(descr, std::move(pager));
case Type::INT96:
- return std::make_shared<Int96Reader>(element, std::move(pager));
+ return std::make_shared<Int96Reader>(descr, std::move(pager));
case Type::FLOAT:
- return std::make_shared<FloatReader>(element, std::move(pager));
+ return std::make_shared<FloatReader>(descr, std::move(pager));
case Type::DOUBLE:
- return std::make_shared<DoubleReader>(element, std::move(pager));
+ return std::make_shared<DoubleReader>(descr, std::move(pager));
case Type::BYTE_ARRAY:
- return std::make_shared<ByteArrayReader>(element, std::move(pager));
+ return std::make_shared<ByteArrayReader>(descr, std::move(pager));
case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<FixedLenByteArrayReader>(element, std::move(pager));
+ return std::make_shared<FixedLenByteArrayReader>(descr, std::move(pager));
default:
ParquetException::NYI("type reader not implemented");
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/column/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/reader.h b/src/parquet/column/reader.h
index 27ff678..0d10f0f 100644
--- a/src/parquet/column/reader.h
+++ b/src/parquet/column/reader.h
@@ -30,23 +30,10 @@
#include "parquet/types.h"
#include "parquet/column/page.h"
-
-#include "parquet/thrift/parquet_constants.h"
-#include "parquet/thrift/parquet_types.h"
#include "parquet/encodings/encodings.h"
+#include "parquet/schema/descriptor.h"
#include "parquet/util/rle-encoding.h"
-namespace std {
-
-template <>
-struct hash<parquet::Encoding::type> {
- std::size_t operator()(const parquet::Encoding::type& k) const {
- return hash<int>()(static_cast<int>(k));
- }
-};
-
-} // namespace std
-
namespace parquet_cpp {
class Codec;
@@ -54,9 +41,9 @@ class Scanner;
class ColumnReader {
public:
- ColumnReader(const parquet::SchemaElement*, std::unique_ptr<PageReader>);
+ ColumnReader(const ColumnDescriptor*, std::unique_ptr<PageReader>);
- static std::shared_ptr<ColumnReader> Make(const parquet::SchemaElement*,
+ static std::shared_ptr<ColumnReader> Make(const ColumnDescriptor*,
std::unique_ptr<PageReader>);
// Returns true if there are still values in this column.
@@ -71,12 +58,12 @@ class ColumnReader {
return true;
}
- parquet::Type::type type() const {
- return schema_->type;
+ Type::type type() const {
+ return descr_->physical_type();
}
- const parquet::SchemaElement* schema() const {
- return schema_;
+ const ColumnDescriptor* descr() const {
+ return descr_;
}
protected:
@@ -92,7 +79,7 @@ class ColumnReader {
// Returns the number of decoded repetition levels
size_t ReadRepetitionLevels(size_t batch_size, int16_t* levels);
- const parquet::SchemaElement* schema_;
+ const ColumnDescriptor* descr_;
std::unique_ptr<PageReader> pager_;
std::shared_ptr<Page> current_page_;
@@ -125,7 +112,7 @@ class TypedColumnReader : public ColumnReader {
public:
typedef typename type_traits<TYPE>::value_type T;
- TypedColumnReader(const parquet::SchemaElement* schema,
+ TypedColumnReader(const ColumnDescriptor* schema,
std::unique_ptr<PageReader> pager) :
ColumnReader(schema, std::move(pager)),
current_decoder_(NULL) {
@@ -162,7 +149,7 @@ class TypedColumnReader : public ColumnReader {
// Map of encoding type to the respective decoder object. For example, a
// column chunk's data pages may include both dictionary-encoded and
// plain-encoded data.
- std::unordered_map<parquet::Encoding::type, std::shared_ptr<DecoderType> > decoders_;
+ std::unordered_map<int, std::shared_ptr<DecoderType> > decoders_;
void ConfigureDictionary(const DictionaryPage* page);
@@ -227,14 +214,14 @@ inline size_t TypedColumnReader<TYPE>::ReadBatch(int batch_size, int16_t* def_le
}
-typedef TypedColumnReader<parquet::Type::BOOLEAN> BoolReader;
-typedef TypedColumnReader<parquet::Type::INT32> Int32Reader;
-typedef TypedColumnReader<parquet::Type::INT64> Int64Reader;
-typedef TypedColumnReader<parquet::Type::INT96> Int96Reader;
-typedef TypedColumnReader<parquet::Type::FLOAT> FloatReader;
-typedef TypedColumnReader<parquet::Type::DOUBLE> DoubleReader;
-typedef TypedColumnReader<parquet::Type::BYTE_ARRAY> ByteArrayReader;
-typedef TypedColumnReader<parquet::Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayReader;
+typedef TypedColumnReader<Type::BOOLEAN> BoolReader;
+typedef TypedColumnReader<Type::INT32> Int32Reader;
+typedef TypedColumnReader<Type::INT64> Int64Reader;
+typedef TypedColumnReader<Type::INT96> Int96Reader;
+typedef TypedColumnReader<Type::FLOAT> FloatReader;
+typedef TypedColumnReader<Type::DOUBLE> DoubleReader;
+typedef TypedColumnReader<Type::BYTE_ARRAY> ByteArrayReader;
+typedef TypedColumnReader<Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayReader;
} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/column/scanner.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.cc b/src/parquet/column/scanner.cc
index b263d1e..58f1460 100644
--- a/src/parquet/column/scanner.cc
+++ b/src/parquet/column/scanner.cc
@@ -20,10 +20,6 @@
#include <memory>
#include "parquet/column/reader.h"
-#include "parquet/thrift/parquet_types.h"
-#include "parquet/thrift/util.h"
-
-using parquet::Type;
namespace parquet_cpp {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/column/scanner.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h
index 8c47a8c..17fd5f6 100644
--- a/src/parquet/column/scanner.h
+++ b/src/parquet/column/scanner.h
@@ -24,7 +24,9 @@
#include <vector>
#include "parquet/column/reader.h"
-#include "parquet/thrift/parquet_types.h"
+
+#include "parquet/schema/descriptor.h"
+#include "parquet/types.h"
namespace parquet_cpp {
@@ -56,8 +58,8 @@ class Scanner {
return value_offset_ < values_buffered_ || reader_->HasNext();
}
- const parquet::SchemaElement* schema() const {
- return reader_->schema();
+ const ColumnDescriptor* descr() const {
+ return reader_->descr();
}
size_t batch_size() const { return batch_size_;}
@@ -150,7 +152,7 @@ class TypedScanner : public Scanner {
NextValue(&val, &is_null);
if (is_null) {
- std::string null_fmt = format_fwf<parquet::Type::BYTE_ARRAY>(width);
+ std::string null_fmt = format_fwf<Type::BYTE_ARRAY>(width);
snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
} else {
FormatValue(&val, buffer, sizeof(buffer), width);
@@ -176,39 +178,39 @@ inline void TypedScanner<TYPE>::FormatValue(void* val, char* buffer,
}
template <>
-inline void TypedScanner<parquet::Type::INT96>::FormatValue(
+inline void TypedScanner<Type::INT96>::FormatValue(
void* val, char* buffer, size_t bufsize, size_t width) {
- std::string fmt = format_fwf<parquet::Type::INT96>(width);
+ std::string fmt = format_fwf<Type::INT96>(width);
std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}
template <>
-inline void TypedScanner<parquet::Type::BYTE_ARRAY>::FormatValue(
+inline void TypedScanner<Type::BYTE_ARRAY>::FormatValue(
void* val, char* buffer, size_t bufsize, size_t width) {
- std::string fmt = format_fwf<parquet::Type::BYTE_ARRAY>(width);
+ std::string fmt = format_fwf<Type::BYTE_ARRAY>(width);
std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}
template <>
-inline void TypedScanner<parquet::Type::FIXED_LEN_BYTE_ARRAY>::FormatValue(
+inline void TypedScanner<Type::FIXED_LEN_BYTE_ARRAY>::FormatValue(
void* val, char* buffer, size_t bufsize, size_t width) {
- std::string fmt = format_fwf<parquet::Type::FIXED_LEN_BYTE_ARRAY>(width);
+ std::string fmt = format_fwf<Type::FIXED_LEN_BYTE_ARRAY>(width);
std::string result = FixedLenByteArrayToString(
*reinterpret_cast<FixedLenByteArray*>(val),
- schema()->type_length);
+ descr()->type_length());
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}
-typedef TypedScanner<parquet::Type::BOOLEAN> BoolScanner;
-typedef TypedScanner<parquet::Type::INT32> Int32Scanner;
-typedef TypedScanner<parquet::Type::INT64> Int64Scanner;
-typedef TypedScanner<parquet::Type::INT96> Int96Scanner;
-typedef TypedScanner<parquet::Type::FLOAT> FloatScanner;
-typedef TypedScanner<parquet::Type::DOUBLE> DoubleScanner;
-typedef TypedScanner<parquet::Type::BYTE_ARRAY> ByteArrayScanner;
-typedef TypedScanner<parquet::Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayScanner;
+typedef TypedScanner<Type::BOOLEAN> BoolScanner;
+typedef TypedScanner<Type::INT32> Int32Scanner;
+typedef TypedScanner<Type::INT64> Int64Scanner;
+typedef TypedScanner<Type::INT96> Int96Scanner;
+typedef TypedScanner<Type::FLOAT> FloatScanner;
+typedef TypedScanner<Type::DOUBLE> DoubleScanner;
+typedef TypedScanner<Type::BYTE_ARRAY> ByteArrayScanner;
+typedef TypedScanner<Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayScanner;
} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/column/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/test-util.h b/src/parquet/column/test-util.h
index 80f3fa1..9aa4e5a 100644
--- a/src/parquet/column/test-util.h
+++ b/src/parquet/column/test-util.h
@@ -28,8 +28,6 @@
#include "parquet/column/page.h"
-using parquet::Encoding;
-
namespace parquet_cpp {
namespace test {
@@ -96,7 +94,7 @@ class DataPageBuilder {
void AppendValues(const std::vector<T>& values,
parquet::Encoding::type encoding) {
- if (encoding != Encoding::PLAIN) {
+ if (encoding != parquet::Encoding::PLAIN) {
ParquetException::NYI("only plain encoding currently implemented");
}
size_t bytes_to_encode = values.size() * sizeof(T);
@@ -150,7 +148,7 @@ class DataPageBuilder {
// Used internally for both repetition and definition levels
void AppendLevels(const std::vector<int16_t>& levels, int16_t max_level,
parquet::Encoding::type encoding) {
- if (encoding != Encoding::RLE) {
+ if (encoding != parquet::Encoding::RLE) {
ParquetException::NYI("only rle encoding currently implemented");
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/encodings/delta-bit-pack-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/delta-bit-pack-encoding.h b/src/parquet/encodings/delta-bit-pack-encoding.h
index 858fcec..4eb762b 100644
--- a/src/parquet/encodings/delta-bit-pack-encoding.h
+++ b/src/parquet/encodings/delta-bit-pack-encoding.h
@@ -30,11 +30,9 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
public:
typedef typename type_traits<TYPE>::value_type T;
- explicit DeltaBitPackDecoder(const parquet::SchemaElement* schema)
- : Decoder<TYPE>(schema, parquet::Encoding::DELTA_BINARY_PACKED) {
- parquet::Type::type type = type_traits<TYPE>::parquet_type;
-
- if (type != parquet::Type::INT32 && type != parquet::Type::INT64) {
+ explicit DeltaBitPackDecoder(const ColumnDescriptor* descr)
+ : Decoder<TYPE>(descr, parquet::Encoding::DELTA_BINARY_PACKED) {
+ if (TYPE != Type::INT32 && TYPE != Type::INT64) {
throw ParquetException("Delta bit pack encoding should only be for integer data.");
}
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/encodings/delta-byte-array-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/delta-byte-array-encoding.h b/src/parquet/encodings/delta-byte-array-encoding.h
index 92f0d53..2763f16 100644
--- a/src/parquet/encodings/delta-byte-array-encoding.h
+++ b/src/parquet/encodings/delta-byte-array-encoding.h
@@ -24,10 +24,10 @@
namespace parquet_cpp {
-class DeltaByteArrayDecoder : public Decoder<parquet::Type::BYTE_ARRAY> {
+class DeltaByteArrayDecoder : public Decoder<Type::BYTE_ARRAY> {
public:
- explicit DeltaByteArrayDecoder(const parquet::SchemaElement* schema)
- : Decoder<parquet::Type::BYTE_ARRAY>(schema, parquet::Encoding::DELTA_BYTE_ARRAY),
+ explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr)
+ : Decoder<Type::BYTE_ARRAY>(descr, parquet::Encoding::DELTA_BYTE_ARRAY),
prefix_len_decoder_(nullptr),
suffix_decoder_(nullptr) {
}
@@ -67,9 +67,9 @@ class DeltaByteArrayDecoder : public Decoder<parquet::Type::BYTE_ARRAY> {
}
private:
- using Decoder<parquet::Type::BYTE_ARRAY>::num_values_;
+ using Decoder<Type::BYTE_ARRAY>::num_values_;
- DeltaBitPackDecoder<parquet::Type::INT32> prefix_len_decoder_;
+ DeltaBitPackDecoder<Type::INT32> prefix_len_decoder_;
DeltaLengthByteArrayDecoder suffix_decoder_;
ByteArray last_value_;
};
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/encodings/delta-length-byte-array-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/delta-length-byte-array-encoding.h b/src/parquet/encodings/delta-length-byte-array-encoding.h
index d138c1a..0868924 100644
--- a/src/parquet/encodings/delta-length-byte-array-encoding.h
+++ b/src/parquet/encodings/delta-length-byte-array-encoding.h
@@ -24,11 +24,11 @@
namespace parquet_cpp {
-class DeltaLengthByteArrayDecoder : public Decoder<parquet::Type::BYTE_ARRAY> {
+class DeltaLengthByteArrayDecoder : public Decoder<Type::BYTE_ARRAY> {
public:
- explicit DeltaLengthByteArrayDecoder(const parquet::SchemaElement* schema)
- : Decoder<parquet::Type::BYTE_ARRAY>(
- schema, parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY),
+ explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr)
+ : Decoder<Type::BYTE_ARRAY>(descr,
+ parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY),
len_decoder_(nullptr) {
}
@@ -57,8 +57,8 @@ class DeltaLengthByteArrayDecoder : public Decoder<parquet::Type::BYTE_ARRAY> {
}
private:
- using Decoder<parquet::Type::BYTE_ARRAY>::num_values_;
- DeltaBitPackDecoder<parquet::Type::INT32> len_decoder_;
+ using Decoder<Type::BYTE_ARRAY>::num_values_;
+ DeltaBitPackDecoder<Type::INT32> len_decoder_;
const uint8_t* data_;
int len_;
};
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/encodings/dictionary-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/dictionary-encoding.h b/src/parquet/encodings/dictionary-encoding.h
index 9641e7e..0547eb3 100644
--- a/src/parquet/encodings/dictionary-encoding.h
+++ b/src/parquet/encodings/dictionary-encoding.h
@@ -30,11 +30,12 @@ class DictionaryDecoder : public Decoder<TYPE> {
public:
typedef typename type_traits<TYPE>::value_type T;
- // Initializes the dictionary with values from 'dictionary'. The data in dictionary
- // is not guaranteed to persist in memory after this call so the dictionary decoder
- // needs to copy the data out if necessary.
- DictionaryDecoder(const parquet::SchemaElement* schema, Decoder<TYPE>* dictionary)
- : Decoder<TYPE>(schema, parquet::Encoding::RLE_DICTIONARY) {
+ // Initializes the dictionary with values from 'dictionary'. The data in
+ // dictionary is not guaranteed to persist in memory after this call so the
+ // dictionary decoder needs to copy the data out if necessary.
+ DictionaryDecoder(const ColumnDescriptor* descr,
+ Decoder<TYPE>* dictionary)
+ : Decoder<TYPE>(descr, parquet::Encoding::RLE_DICTIONARY) {
Init(dictionary);
}
@@ -86,14 +87,14 @@ inline void DictionaryDecoder<TYPE>::Init(Decoder<TYPE>* dictionary) {
}
template <>
-inline void DictionaryDecoder<parquet::Type::BOOLEAN>::Init(
- Decoder<parquet::Type::BOOLEAN>* dictionary) {
+inline void DictionaryDecoder<Type::BOOLEAN>::Init(
+ Decoder<Type::BOOLEAN>* dictionary) {
ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
}
template <>
-inline void DictionaryDecoder<parquet::Type::BYTE_ARRAY>::Init(
- Decoder<parquet::Type::BYTE_ARRAY>* dictionary) {
+inline void DictionaryDecoder<Type::BYTE_ARRAY>::Init(
+ Decoder<Type::BYTE_ARRAY>* dictionary) {
int num_dictionary_values = dictionary->values_left();
dictionary_.resize(num_dictionary_values);
dictionary->Decode(&dictionary_[0], num_dictionary_values);
@@ -112,13 +113,13 @@ inline void DictionaryDecoder<parquet::Type::BYTE_ARRAY>::Init(
}
template <>
-inline void DictionaryDecoder<parquet::Type::FIXED_LEN_BYTE_ARRAY>::Init(
- Decoder<parquet::Type::FIXED_LEN_BYTE_ARRAY>* dictionary) {
+inline void DictionaryDecoder<Type::FIXED_LEN_BYTE_ARRAY>::Init(
+ Decoder<Type::FIXED_LEN_BYTE_ARRAY>* dictionary) {
int num_dictionary_values = dictionary->values_left();
dictionary_.resize(num_dictionary_values);
dictionary->Decode(&dictionary_[0], num_dictionary_values);
- int fixed_len = schema_->type_length;
+ int fixed_len = descr_->type_length();
int total_size = num_dictionary_values*fixed_len;
byte_array_data_.resize(total_size);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/encodings/encodings.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encodings.h b/src/parquet/encodings/encodings.h
index c427ff4..21754d1 100644
--- a/src/parquet/encodings/encodings.h
+++ b/src/parquet/encodings/encodings.h
@@ -26,9 +26,13 @@
#include "parquet/util/rle-encoding.h"
#include "parquet/util/bit-stream-utils.inline.h"
+#include "parquet/schema/descriptor.h"
+
+#include "parquet/thrift/parquet_types.h"
+
namespace parquet_cpp {
-// The Decoder template is parameterized on parquet::Type::type
+// The Decoder template is parameterized on parquet_cpp::Type::type
template <int TYPE>
class Decoder {
public:
@@ -55,12 +59,12 @@ class Decoder {
const parquet::Encoding::type encoding() const { return encoding_; }
protected:
- explicit Decoder(const parquet::SchemaElement* schema,
+ explicit Decoder(const ColumnDescriptor* descr,
const parquet::Encoding::type& encoding)
- : schema_(schema), encoding_(encoding), num_values_(0) {}
+ : descr_(descr), encoding_(encoding), num_values_(0) {}
// For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
- const parquet::SchemaElement* schema_;
+ const ColumnDescriptor* descr_;
const parquet::Encoding::type encoding_;
int num_values_;
@@ -91,12 +95,12 @@ class Encoder {
const parquet::Encoding::type encoding() const { return encoding_; }
protected:
- explicit Encoder(const parquet::SchemaElement* schema,
+ explicit Encoder(const ColumnDescriptor* descr,
const parquet::Encoding::type& encoding)
- : schema_(schema), encoding_(encoding) {}
+ : descr_(descr), encoding_(encoding) {}
// For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
- const parquet::SchemaElement* schema_;
+ const ColumnDescriptor* descr_;
const parquet::Encoding::type encoding_;
};
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/encodings/plain-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/plain-encoding.h b/src/parquet/encodings/plain-encoding.h
index cc37776..06d237a 100644
--- a/src/parquet/encodings/plain-encoding.h
+++ b/src/parquet/encodings/plain-encoding.h
@@ -23,8 +23,6 @@
#include <algorithm>
#include <vector>
-using parquet::Type;
-
namespace parquet_cpp {
// ----------------------------------------------------------------------
@@ -36,8 +34,8 @@ class PlainDecoder : public Decoder<TYPE> {
typedef typename type_traits<TYPE>::value_type T;
using Decoder<TYPE>::num_values_;
- explicit PlainDecoder(const parquet::SchemaElement* schema) :
- Decoder<TYPE>(schema, parquet::Encoding::PLAIN),
+ explicit PlainDecoder(const ColumnDescriptor* descr) :
+ Decoder<TYPE>(descr, parquet::Encoding::PLAIN),
data_(NULL), len_(0) {}
virtual void SetData(int num_values, const uint8_t* data, int len) {
@@ -85,7 +83,7 @@ template <>
inline int PlainDecoder<Type::FIXED_LEN_BYTE_ARRAY>::Decode(
FixedLenByteArray* buffer, int max_values) {
max_values = std::min(max_values, num_values_);
- int len = schema_->type_length;
+ int len = descr_->type_length();
for (int i = 0; i < max_values; ++i) {
if (len_ < len) ParquetException::EofException();
buffer[i].ptr = data_;
@@ -99,8 +97,8 @@ inline int PlainDecoder<Type::FIXED_LEN_BYTE_ARRAY>::Decode(
template <>
class PlainDecoder<Type::BOOLEAN> : public Decoder<Type::BOOLEAN> {
public:
- explicit PlainDecoder(const parquet::SchemaElement* schema) :
- Decoder<Type::BOOLEAN>(schema, parquet::Encoding::PLAIN) {}
+ explicit PlainDecoder(const ColumnDescriptor* descr) :
+ Decoder<Type::BOOLEAN>(descr, parquet::Encoding::PLAIN) {}
virtual void SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
@@ -146,8 +144,8 @@ class PlainEncoder : public Encoder<TYPE> {
public:
typedef typename type_traits<TYPE>::value_type T;
- explicit PlainEncoder(const parquet::SchemaElement* schema) :
- Encoder<TYPE>(schema, parquet::Encoding::PLAIN) {}
+ explicit PlainEncoder(const ColumnDescriptor* descr) :
+ Encoder<TYPE>(descr, parquet::Encoding::PLAIN) {}
virtual size_t Encode(const T* src, int num_values, uint8_t* dst);
};
@@ -155,8 +153,8 @@ class PlainEncoder : public Encoder<TYPE> {
template <>
class PlainEncoder<Type::BOOLEAN> : public Encoder<Type::BOOLEAN> {
public:
- explicit PlainEncoder(const parquet::SchemaElement* schema) :
- Encoder<Type::BOOLEAN>(schema, parquet::Encoding::PLAIN) {}
+ explicit PlainEncoder(const ColumnDescriptor* descr) :
+ Encoder<Type::BOOLEAN>(descr, parquet::Encoding::PLAIN) {}
virtual size_t Encode(const std::vector<bool>& src, int num_values,
uint8_t* dst) {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader.cc b/src/parquet/reader.cc
index a4e767e..a90bafb 100644
--- a/src/parquet/reader.cc
+++ b/src/parquet/reader.cc
@@ -29,12 +29,12 @@
#include "parquet/column/scanner.h"
#include "parquet/exception.h"
+#include "parquet/schema/converter.h"
#include "parquet/thrift/util.h"
#include "parquet/util/input_stream.h"
using std::string;
using std::vector;
-using parquet::Type;
namespace parquet_cpp {
@@ -116,13 +116,12 @@ std::shared_ptr<ColumnReader> RowGroupReader::Column(size_t i) {
throw ParquetException("Unable to read column chunk data");
}
- // TODO(wesm): This presumes a flat schema
- const parquet::SchemaElement* schema = &parent_->metadata_.schema[i + 1];
+ const ColumnDescriptor* descr = parent_->column_descr(i);
std::unique_ptr<PageReader> pager(
new SerializedPageReader(std::move(input), col.meta_data.codec));
- std::shared_ptr<ColumnReader> reader = ColumnReader::Make(schema,
+ std::shared_ptr<ColumnReader> reader = ColumnReader::Make(descr,
std::move(pager));
column_readers_[i] = reader;
@@ -212,44 +211,17 @@ void ParquetFileReader::ParseMetaData() {
throw ParquetException("Invalid parquet file. Could not read metadata bytes.");
}
DeserializeThriftMsg(&metadata_buffer[0], &metadata_len, &metadata_);
+
+ schema::FlatSchemaConverter converter(&metadata_.schema[0],
+ metadata_.schema.size());
+ schema_descr_.Init(converter.Convert());
+
parsed_metadata_ = true;
}
// ----------------------------------------------------------------------
// ParquetFileReader::DebugPrint
-static string parquet_type_to_string(Type::type t) {
- switch (t) {
- case Type::BOOLEAN:
- return "BOOLEAN";
- break;
- case Type::INT32:
- return "INT32";
- break;
- case Type::INT64:
- return "INT64";
- break;
- case Type::INT96:
- return "INT96";
- break;
- case Type::FLOAT:
- return "FLOAT";
- break;
- case Type::DOUBLE:
- return "DOUBLE";
- break;
- case Type::BYTE_ARRAY:
- return "BYTE_ARRAY";
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- return "FIXED_LEN_BYTE_ARRAY";
- break;
- default:
- return "UNKNOWN";
- break;
- }
-}
-
// the fixed initial size is just for an example
#define COL_WIDTH "20"
@@ -261,24 +233,27 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
stream << "File statistics:\n";
stream << "Total rows: " << metadata_.num_rows << "\n";
- for (int c = 1; c < metadata_.schema.size(); ++c) {
- stream << "Column " << c-1 << ": " << metadata_.schema[c].name << " ("
- << parquet_type_to_string(metadata_.schema[c].type);
- stream << ")\n";
+ for (int i = 0; i < num_columns(); ++i) {
+ const ColumnDescriptor* descr = column_descr(i);
+ stream << "Column " << i << ": "
+ << descr->name()
+ << " ("
+ << type_to_string(descr->physical_type())
+ << ")" << std::endl;
}
- for (int i = 0; i < metadata_.row_groups.size(); ++i) {
+ for (int i = 0; i < num_row_groups(); ++i) {
stream << "--- Row Group " << i << " ---\n";
RowGroupReader* group_reader = RowGroup(i);
// Print column metadata
- size_t nColumns = group_reader->num_columns();
+ size_t num_columns = group_reader->num_columns();
- for (int c = 0; c < group_reader->num_columns(); ++c) {
- const parquet::ColumnMetaData* meta_data = group_reader->column_metadata(c);
- stream << "Column " << c
- << ": " << meta_data->num_values << " rows, "
+ for (int i = 0; i < num_columns; ++i) {
+ const parquet::ColumnMetaData* meta_data = group_reader->column_metadata(i);
+ stream << "Column " << i << ": "
+ << meta_data->num_values << " rows, "
<< meta_data->statistics.null_count << " null values, "
<< meta_data->statistics.distinct_count << " distinct values, "
<< "min value: " << (meta_data->statistics.min.length() > 0 ?
@@ -295,36 +270,31 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
char buffer[bufsize];
// Create readers for all columns and print contents
- vector<std::shared_ptr<Scanner> > scanners(nColumns, NULL);
- for (int c = 0; c < nColumns; ++c) {
- std::shared_ptr<ColumnReader> col_reader = group_reader->Column(c);
+ vector<std::shared_ptr<Scanner> > scanners(num_columns, NULL);
+ for (int i = 0; i < num_columns; ++i) {
+ std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
Type::type col_type = col_reader->type();
std::stringstream ss;
ss << "%-" << COL_WIDTH << "s";
std::string fmt = ss.str();
- snprintf(buffer, bufsize, fmt.c_str(), metadata_.schema[c+1].name.c_str());
+ snprintf(buffer, bufsize, fmt.c_str(), column_descr(i)->name().c_str());
stream << buffer;
// This is OK in this method as long as the RowGroupReader does not get
// deleted
- scanners[c] = Scanner::Make(col_reader);
+ scanners[i] = Scanner::Make(col_reader);
}
stream << "\n";
bool hasRow;
do {
hasRow = false;
- for (int c = 0; c < nColumns; ++c) {
- if (scanners[c] == NULL) {
- snprintf(buffer, bufsize, "%-" COL_WIDTH"s", " ");
- stream << buffer;
- continue;
- }
- if (scanners[c]->HasNext()) {
+ for (int i = 0; i < num_columns; ++i) {
+ if (scanners[i]->HasNext()) {
hasRow = true;
- scanners[c]->PrintNext(stream, 17);
+ scanners[i]->PrintNext(stream, 17);
}
}
stream << "\n";
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/reader.h b/src/parquet/reader.h
index 16927a7..ea23182 100644
--- a/src/parquet/reader.h
+++ b/src/parquet/reader.h
@@ -25,8 +25,11 @@
#include <unordered_map>
#include "parquet/thrift/parquet_types.h"
+
#include "parquet/types.h"
+#include "parquet/schema/descriptor.h"
+
namespace parquet_cpp {
class ColumnReader;
@@ -122,6 +125,14 @@ class ParquetFileReader {
return metadata_.row_groups.size();
}
+ const ColumnDescriptor* column_descr(size_t i) const {
+ return schema_descr_.Column(i);
+ }
+
+ size_t num_columns() const {
+ return schema_descr_.num_columns();
+ }
+
const parquet::FileMetaData& metadata() const {
return metadata_;
}
@@ -132,6 +143,8 @@ class ParquetFileReader {
friend class RowGroupReader;
parquet::FileMetaData metadata_;
+ SchemaDescriptor schema_descr_;
+
bool parsed_metadata_;
// Row group index -> RowGroupReader
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/schema/CMakeLists.txt b/src/parquet/schema/CMakeLists.txt
new file mode 100644
index 0000000..0902ccf
--- /dev/null
+++ b/src/parquet/schema/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Headers: top level
+install(FILES
+ builder.h
+ converter.h
+ descriptor.h
+ types.h
+ DESTINATION include/parquet/schema)
+
+ADD_PARQUET_TEST(schema-converter-test)
+ADD_PARQUET_TEST(schema-descriptor-test)
+ADD_PARQUET_TEST(schema-printer-test)
+ADD_PARQUET_TEST(schema-types-test)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/converter.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.cc b/src/parquet/schema/converter.cc
new file mode 100644
index 0000000..9b45cc9
--- /dev/null
+++ b/src/parquet/schema/converter.cc
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema/converter.h"
+
+#include <string>
+
+#include "parquet/exception.h"
+
+using parquet::SchemaElement;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+std::unique_ptr<Node> FlatSchemaConverter::Convert() {
+ const SchemaElement& root = elements_[0];
+
+ // Validate the root node
+ if (root.num_children == 0) {
+ throw ParquetException("Root node did not have children");
+ }
+
+ // Relaxing this restriction as some implementations don't set this
+ // if (root.repetition_type != FieldRepetitionType::REPEATED) {
+ // throw ParquetException("Root node was not FieldRepetitionType::REPEATED");
+ // }
+
+ return NextNode();
+}
+
+std::unique_ptr<Node> FlatSchemaConverter::NextNode() {
+ const SchemaElement& element = Next();
+
+ size_t node_id = next_id();
+
+ const void* opaque_element = static_cast<const void*>(&element);
+
+ if (element.num_children == 0) {
+ // Leaf (primitive) node
+ return PrimitiveNode::FromParquet(opaque_element, node_id);
+ } else {
+ // Group
+ NodeVector fields;
+ for (size_t i = 0; i < element.num_children; ++i) {
+ std::unique_ptr<Node> field = NextNode();
+ fields.push_back(NodePtr(field.release()));
+ }
+ return GroupNode::FromParquet(opaque_element, node_id, fields);
+ }
+}
+
+const parquet::SchemaElement& FlatSchemaConverter::Next() {
+ if (pos_ == length_) {
+ throw ParquetException("Malformed schema: not enough parquet::SchemaElement values");
+ }
+ return elements_[pos_++];
+}
+
+std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
+ FlatSchemaConverter converter(&schema[0], schema.size());
+ std::unique_ptr<Node> root = converter.Convert();
+
+ std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
+ descr->Init(std::shared_ptr<GroupNode>(
+ static_cast<GroupNode*>(root.release())));
+
+ return descr;
+}
+
+// ----------------------------------------------------------------------
+// Conversion back to Parquet metadata
+
+// TODO: decide later what to do with these. When converting back only need to
+// write into a parquet::SchemaElement
+
+// FieldRepetitionType::type ToParquet(Repetition::type type) {
+// return static_cast<FieldRepetitionType::type>(type);
+// }
+
+// parquet::ConvertedType::type ToParquet(LogicalType::type type) {
+// // item 0 is NONE
+// return static_cast<parquet::ConvertedType::type>(static_cast<int>(type) - 1);
+// }
+
+// parquet::Type::type ToParquet(Type::type type) {
+// return static_cast<parquet::Type::type>(type);
+// }
+
+} // namespace schema
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/converter.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.h b/src/parquet/schema/converter.h
new file mode 100644
index 0000000..cde48c9
--- /dev/null
+++ b/src/parquet/schema/converter.h
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Conversion routines for converting to and from flat Parquet metadata. Among
+// other things, this limits the exposure of the internals of the Thrift
+// metadata structs to the rest of the library.
+
+// NB: This file is not part of the schema public API and only used internally
+// for converting to and from Parquet Thrift metadata
+
+#ifndef PARQUET_SCHEMA_CONVERTER_H
+#define PARQUET_SCHEMA_CONVERTER_H
+
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "parquet/schema/descriptor.h"
+#include "parquet/schema/types.h"
+
+#include "parquet/thrift/parquet_types.h"
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Conversion from Parquet Thrift metadata
+
+std::shared_ptr<SchemaDescriptor> FromParquet(
+ const std::vector<parquet::SchemaElement>& schema);
+
+class FlatSchemaConverter {
+ public:
+ FlatSchemaConverter(const parquet::SchemaElement* elements, size_t length) :
+ elements_(elements),
+ length_(length),
+ pos_(0),
+ current_id_(0) {}
+
+ std::unique_ptr<Node> Convert();
+
+ private:
+ const parquet::SchemaElement* elements_;
+ size_t length_;
+ size_t pos_;
+ size_t current_id_;
+
+ size_t next_id() {
+ return current_id_++;
+ }
+
+ const parquet::SchemaElement& Next();
+
+ std::unique_ptr<Node> NextNode();
+};
+
+// ----------------------------------------------------------------------
+// Conversion to Parquet Thrift metadata
+
+void ToParquet(const GroupNode* schema, std::vector<parquet::SchemaElement>* out);
+
+// Converts nested parquet_cpp schema back to a flat vector of Thrift structs
+class SchemaFlattener {
+ public:
+ SchemaFlattener(const GroupNode* schema, std::vector<parquet::SchemaElement>* out);
+
+ private:
+ const GroupNode* root_;
+ std::vector<parquet::SchemaElement>* schema_;
+};
+
+} // namespace schema
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_SCHEMA_CONVERTER_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
new file mode 100644
index 0000000..02b5060
--- /dev/null
+++ b/src/parquet/schema/descriptor.cc
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema/descriptor.h"
+
+#include "parquet/exception.h"
+
+namespace parquet_cpp {
+
+using schema::NodePtr;
+using schema::PrimitiveNode;
+using schema::GroupNode;
+
+void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
+ Init(NodePtr(schema.release()));
+}
+
+void SchemaDescriptor::Init(const NodePtr& schema) {
+ schema_ = schema;
+
+ if (!schema_->is_group()) {
+ throw ParquetException("Must initialize with a schema group");
+ }
+
+ group_ = static_cast<const GroupNode*>(schema_.get());
+ leaves_.clear();
+
+ for (size_t i = 0; i < group_->field_count(); ++i) {
+ BuildTree(group_->field(i), 0, 0);
+ }
+}
+
+void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level) {
+ if (node->is_optional()) {
+ ++max_def_level;
+ } else if (node->is_repeated()) {
+ // Repeated fields add a definition level. This is used to distinguish
+ // between an empty list and a list with an item in it.
+ ++max_rep_level;
+ ++max_def_level;
+ }
+
+ // Now, walk the schema and create a ColumnDescriptor for each leaf node
+ if (node->is_group()) {
+ const GroupNode* group = static_cast<const GroupNode*>(node.get());
+ for (size_t i = 0; i < group->field_count(); ++i) {
+ BuildTree(group->field(i), max_def_level, max_rep_level);
+ }
+ } else {
+ // Primitive node, append to leaves
+ leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
+ }
+}
+
+ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
+ int16_t max_definition_level, int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr) :
+ node_(node),
+ max_definition_level_(max_definition_level),
+ max_repetition_level_(max_repetition_level),
+ schema_descr_(schema_descr) {
+ if (!node_->is_primitive()) {
+ throw ParquetException("Must be a primitive type");
+ }
+ primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
+}
+
+const ColumnDescriptor* SchemaDescriptor::Column(size_t i) const {
+ return &leaves_[i];
+}
+
+int ColumnDescriptor::type_length() const {
+ if (primitive_node_->physical_type() != Type::FIXED_LEN_BYTE_ARRAY) {
+ throw ParquetException("Not a FIXED_LEN_BYTE_ARRAY");
+ }
+ return primitive_node_->type_length();
+}
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
new file mode 100644
index 0000000..144666f
--- /dev/null
+++ b/src/parquet/schema/descriptor.h
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_SCHEMA_DESCRIPTOR_H
+#define PARQUET_SCHEMA_DESCRIPTOR_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "parquet/schema/types.h"
+
+namespace parquet_cpp {
+
+class SchemaDescriptor;
+
+// The ColumnDescriptor encapsulates information necessary to interpret
+// primitive column data in the context of a particular schema. We have to
+// examine the node structure of a column's path to the root in the schema tree
+// to be able to reassemble the nested structure from the repetition and
+// definition levels.
+class ColumnDescriptor {
+ public:
+ ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
+ int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
+
+ int16_t max_definition_level() const {
+ return max_definition_level_;
+ }
+
+ int16_t max_repetition_level() const {
+ return max_repetition_level_;
+ }
+
+ Type::type physical_type() const {
+ return primitive_node_->physical_type();
+ }
+
+ const std::string& name() const {
+ return primitive_node_->name();
+ }
+
+ int type_length() const;
+
+ private:
+ schema::NodePtr node_;
+ const schema::PrimitiveNode* primitive_node_;
+
+ int16_t max_definition_level_;
+ int16_t max_repetition_level_;
+
+ // When this descriptor is part of a real schema (and not being used for
+ // testing purposes), maintain a link back to the parent SchemaDescriptor to
+ // enable reverse graph traversals
+ const SchemaDescriptor* schema_descr_;
+};
+
+// Container for the converted Parquet schema with a computed information from
+// the schema analysis needed for file reading
+//
+// * Column index to Node
+// * Max repetition / definition levels for each primitive node
+//
+// The ColumnDescriptor objects produced by this class can be used to assist in
+// the reconstruction of fully materialized data structures from the
+// repetition-definition level encoding of nested data
+//
+// TODO(wesm): this object can be recomputed from a Schema
+class SchemaDescriptor {
+ public:
+ SchemaDescriptor() {}
+ ~SchemaDescriptor() {}
+
+ // Analyze the schema
+ void Init(std::unique_ptr<schema::Node> schema);
+ void Init(const schema::NodePtr& schema);
+
+ const ColumnDescriptor* Column(size_t i) const;
+
+ // The number of physical columns appearing in the file
+ size_t num_columns() const {
+ return leaves_.size();
+ }
+
+ private:
+ friend class ColumnDescriptor;
+
+ schema::NodePtr schema_;
+ const schema::GroupNode* group_;
+
+ void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level);
+
+ // Result of leaf node / tree analysis
+ std::vector<ColumnDescriptor> leaves_;
+
+ // Mapping between leaf nodes and root group of leaf (first node
+ // below the schema's root group)
+ //
+ // For example, the leaf `a.b.c.d` would have a link back to `a`
+ //
+ // -- a <------
+ // -- -- b |
+ // -- -- -- c |
+ // -- -- -- -- d
+ std::unordered_map<int, schema::NodePtr> leaf_to_base_;
+};
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_SCHEMA_DESCRIPTOR_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc
new file mode 100644
index 0000000..9c43e8e
--- /dev/null
+++ b/src/parquet/schema/printer.cc
@@ -0,0 +1,139 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema/printer.h"
+
+#include <string>
+
+namespace parquet_cpp {
+
+namespace schema {
+
+class SchemaPrinter : public Node::Visitor {
+ public:
+ explicit SchemaPrinter(std::ostream& stream, size_t indent_width) :
+ stream_(stream),
+ indent_(0),
+ indent_width_(2) {}
+
+ virtual void Visit(const Node* node);
+
+ private:
+ void Visit(const PrimitiveNode* node);
+ void Visit(const GroupNode* node);
+
+ void Indent();
+
+ std::ostream& stream_;
+
+ size_t indent_;
+ size_t indent_width_;
+};
+
+static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
+ switch (repetition) {
+ case Repetition::REQUIRED:
+ stream << "required";
+ break;
+ case Repetition::OPTIONAL:
+ stream << "optional";
+ break;
+ case Repetition::REPEATED:
+ stream << "repeated";
+ break;
+ default:
+ break;
+ }
+}
+
+static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
+ switch (node->physical_type()) {
+ case Type::BOOLEAN:
+ stream << "boolean";
+ break;
+ case Type::INT32:
+ stream << "int32";
+ break;
+ case Type::INT64:
+ stream << "int64";
+ break;
+ case Type::INT96:
+ stream << "int96";
+ break;
+ case Type::FLOAT:
+ stream << "float";
+ break;
+ case Type::DOUBLE:
+ stream << "double";
+ break;
+ case Type::BYTE_ARRAY:
+ stream << "byte_array";
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ stream << "fixed_len_byte_array(" << node->type_length() << ")";
+ break;
+ default:
+ break;
+ }
+}
+
+void SchemaPrinter::Visit(const PrimitiveNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " ";
+ PrintType(node, stream_);
+ stream_ << " " << node->name() << std::endl;
+}
+
+void SchemaPrinter::Visit(const GroupNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " group " << node->name() << " {" << std::endl;
+
+ indent_ += indent_width_;
+ for (size_t i = 0; i < node->field_count(); ++i) {
+ node->field(i)->Visit(this);
+ }
+ indent_ -= indent_width_;
+ Indent();
+ stream_ << "}" << std::endl;
+}
+
+void SchemaPrinter::Indent() {
+ if (indent_ > 0) {
+ std::string spaces(indent_, ' ');
+ stream_ << spaces;
+ }
+}
+
+void SchemaPrinter::Visit(const Node* node) {
+ Indent();
+ if (node->is_group()) {
+ Visit(static_cast<const GroupNode*>(node));
+ } else {
+ // Primitive
+ Visit(static_cast<const PrimitiveNode*>(node));
+ }
+}
+
+void PrintSchema(const Node* schema, std::ostream& stream,
+ size_t indent_width) {
+ SchemaPrinter printer(stream, indent_width);
+ printer.Visit(schema);
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/printer.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.h b/src/parquet/schema/printer.h
new file mode 100644
index 0000000..535262f
--- /dev/null
+++ b/src/parquet/schema/printer.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// A simple Schema printer using the visitor pattern
+
+#ifndef PARQUET_SCHEMA_PRINTER_H
+#define PARQUET_SCHEMA_PRINTER_H
+
+#include "parquet/schema/types.h"
+
+#include <ostream>
+
+namespace parquet_cpp {
+
+namespace schema {
+
// Print a human-readable, indented representation of `schema` (and all of
// its descendants) to `stream`, using `indent_width` spaces per nesting
// level.
void PrintSchema(const Node* schema, std::ostream& stream,
    size_t indent_width = 2);
+
+} // namespace schema
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_SCHEMA_PRINTER_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/schema-converter-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc
new file mode 100644
index 0000000..a5e1df7
--- /dev/null
+++ b/src/parquet/schema/schema-converter-test.cc
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include <gtest/gtest.h>
#include "parquet/util/test-common.h"

#include "parquet/schema/converter.h"
#include "parquet/thrift/parquet_types.h"

#include "parquet/schema/test-util.h"
+
+using std::string;
+using std::vector;
+
+using parquet::ConvertedType;
+using parquet::FieldRepetitionType;
+using parquet::SchemaElement;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Test convert group
+
+class TestSchemaConverter : public ::testing::Test {
+ public:
+ void setUp() {
+ name_ = "parquet_cpp_schema";
+ }
+
+ void Convert(const parquet::SchemaElement* elements, size_t length) {
+ FlatSchemaConverter converter(elements, length);
+ node_ = converter.Convert();
+ ASSERT_TRUE(node_->is_group());
+ group_ = static_cast<const GroupNode*>(node_.get());
+ }
+
+ protected:
+ std::string name_;
+ const GroupNode* group_;
+ std::unique_ptr<Node> node_;
+};
+
+TEST_F(TestSchemaConverter, NestedExample) {
+ SchemaElement elt;
+ std::vector<SchemaElement> elements;
+ elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2));
+
+ // A primitive one
+ elements.push_back(NewPrimitive("a", FieldRepetitionType::REQUIRED,
+ parquet::Type::INT32));
+
+ // A group
+ elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1));
+
+ // 3-level list encoding, by hand
+ elt = NewGroup("b", FieldRepetitionType::REPEATED, 1);
+ elt.__set_converted_type(ConvertedType::LIST);
+ elements.push_back(elt);
+ elements.push_back(NewPrimitive("item", FieldRepetitionType::OPTIONAL,
+ parquet::Type::INT64));
+
+ Convert(&elements[0], elements.size());
+
+ // Construct the expected schema
+ NodeVector fields;
+ fields.push_back(Int32("a", Repetition::REQUIRED));
+
+ // 3-level list encoding
+ NodePtr item = Int64("item");
+ NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
+ NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
+ fields.push_back(bag);
+
+ NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
+
+ ASSERT_TRUE(schema->Equals(group_));
+}
+
+TEST_F(TestSchemaConverter, InvalidRoot) {
+ // According to the Parquet specification, the first element in the
+ // list<SchemaElement> is a group whose children (and their descendants)
+ // contain all of the rest of the flattened schema elements. If the first
+ // element is not a group, it is a malformed Parquet file.
+
+ SchemaElement elements[2];
+ elements[0] = NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED,
+ parquet::Type::INT32);
+ ASSERT_THROW(Convert(elements, 2), ParquetException);
+
+ // While the Parquet spec indicates that the root group should have REPEATED
+ // repetition type, some implementations may return REQUIRED or OPTIONAL
+ // groups as the first element. These tests check that this is okay as a
+ // practicality matter.
+ elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1);
+ elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED,
+ parquet::Type::INT32);
+ Convert(elements, 2);
+
+ elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1);
+ Convert(elements, 2);
+}
+
+TEST_F(TestSchemaConverter, NotEnoughChildren) {
+ // Throw a ParquetException, but don't core dump or anything
+ SchemaElement elt;
+ std::vector<SchemaElement> elements;
+ elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2));
+ ASSERT_THROW(Convert(&elements[0], 2), ParquetException);
+}
+
+// ----------------------------------------------------------------------
+// Schema tree flatten / unflatten
+
+} // namespace schema
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
new file mode 100644
index 0000000..1328bed
--- /dev/null
+++ b/src/parquet/schema/schema-descriptor-test.cc
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Schema / column descriptor correctness tests (from flat Parquet schemas)
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "parquet/util/test-common.h"
+
+#include "parquet/schema/converter.h"
+#include "parquet/schema/descriptor.h"
+
+#include "parquet/thrift/parquet_types.h"
+
+using std::string;
+using std::vector;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+TEST(TestColumnDescriptor, TestAttrs) {
+ NodePtr node = PrimitiveNode::Make("name", Repetition::OPTIONAL,
+ Type::BYTE_ARRAY, LogicalType::UTF8);
+ ColumnDescriptor descr(node, 4, 1);
+
+ ASSERT_EQ("name", descr.name());
+ ASSERT_EQ(4, descr.max_definition_level());
+ ASSERT_EQ(1, descr.max_repetition_level());
+
+ ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type());
+
+ ASSERT_THROW(descr.type_length(), ParquetException);
+
+ // Test FIXED_LEN_BYTE_ARRAY
+ node = PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL,
+ Type::FIXED_LEN_BYTE_ARRAY, 12, LogicalType::UTF8);
+ descr = ColumnDescriptor(node, 4, 1);
+
+ ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type());
+ ASSERT_EQ(12, descr.type_length());
+}
+
+class TestSchemaDescriptor : public ::testing::Test {
+ public:
+ void setUp() {
+ }
+
+ protected:
+ SchemaDescriptor descr_;
+};
+
+TEST_F(TestSchemaDescriptor, InitNonGroup) {
+ NodePtr node = PrimitiveNode::Make("field", Repetition::OPTIONAL,
+ Type::INT32);
+
+ ASSERT_THROW(descr_.Init(node), ParquetException);
+}
+
TEST_F(TestSchemaDescriptor, BuildTree) {
  NodeVector fields;
  NodePtr schema;

  // Three flat primitive columns with each repetition type
  fields.push_back(Int32("a", Repetition::REQUIRED));
  fields.push_back(Int64("b", Repetition::OPTIONAL));
  fields.push_back(ByteArray("c", Repetition::REPEATED));

  // 3-level list encoding
  NodePtr item1 = Int64("item1", Repetition::REQUIRED);
  NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
  NodePtr item3 = Int32("item3", Repetition::REPEATED);
  NodePtr list(GroupNode::Make("records", Repetition::REPEATED,
      {item1, item2, item3}, LogicalType::LIST));
  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
  fields.push_back(bag);

  schema = GroupNode::Make("schema", Repetition::REPEATED, fields);

  descr_.Init(schema);

  size_t nleaves = 6;

  // 6 leaves
  ASSERT_EQ(nleaves, descr_.num_columns());

  // Expected maximum definition / repetition levels per node
  // (NOTE: the item2 row was previously mislabeled "item1"):
  //
  //                         mdef mrep
  // required int32 a           0    0
  // optional int64 b           1    0
  // repeated byte_array c      1    1
  // optional group bag         1    0
  // repeated group records     2    1
  // required int64 item1       2    1
  // optional boolean item2     3    1
  // repeated int32 item3       3    2
  int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3};
  int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2};

  // Leaves are indexed in schema (depth-first) order: a, b, c, item1-3
  for (size_t i = 0; i < nleaves; ++i) {
    const ColumnDescriptor* col = descr_.Column(i);
    EXPECT_EQ(ex_max_def_levels[i], col->max_definition_level()) << i;
    EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i;
  }

  // Init clears the leaves: re-running the analysis must not accumulate
  descr_.Init(schema);
  ASSERT_EQ(nleaves, descr_.num_columns());
}
+
+} // namespace schema
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/04d75c7c/src/parquet/schema/schema-printer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-printer-test.cc b/src/parquet/schema/schema-printer-test.cc
new file mode 100644
index 0000000..c21429a
--- /dev/null
+++ b/src/parquet/schema/schema-printer-test.cc
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

#include <gtest/gtest.h>
#include "parquet/util/test-common.h"

#include "parquet/schema/printer.h"
#include "parquet/schema/test-util.h"
+
+using std::string;
+using std::vector;
+
+namespace parquet_cpp {
+
+namespace schema {
+
+static std::string Print(const NodePtr& node) {
+ std::stringstream ss;
+ PrintSchema(node.get(), ss);
+ return ss.str();
+}
+
+TEST(TestSchemaPrinter, Examples) {
+ // Test schema 1
+ NodeVector fields;
+ fields.push_back(Int32("a", Repetition::REQUIRED));
+
+ // 3-level list encoding
+ NodePtr item1 = Int64("item1");
+ NodePtr item2 = Boolean("item2", Repetition::REQUIRED);
+ NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item1, item2},
+ LogicalType::LIST));
+ NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
+ fields.push_back(bag);
+
+ NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
+
+ std::string result = Print(schema);
+ std::string expected = R"(repeated group schema {
+ required int32 a
+ optional group bag {
+ repeated group b {
+ optional int64 item1
+ required boolean item2
+ }
+ }
+}
+)";
+ ASSERT_EQ(expected, result);
+}
+
+} // namespace schema
+
+} // namespace parquet_cpp