You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2017/01/26 17:02:51 UTC
[2/3] parquet-cpp git commit: PARQUET-844: Schema,
compression consolidation / flattening
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema.cc b/src/parquet/schema.cc
new file mode 100644
index 0000000..13fca68
--- /dev/null
+++ b/src/parquet/schema.cc
@@ -0,0 +1,655 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema.h"
+#include "parquet/schema-internal.h"
+
+#include <algorithm>
+#include <memory>
+
+#include "parquet/exception.h"
+#include "parquet/thrift/parquet_types.h"
+#include "parquet/thrift/util.h"
+
+using parquet::format::SchemaElement;
+
+namespace parquet {
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// ColumnPath
+
+// Splits `dotstring` on '.' and wraps the resulting components in a new
+// ColumnPath.
+std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
+  std::vector<std::string> components;
+  std::stringstream stream(dotstring);
+  for (std::string token; std::getline(stream, token, '.');) {
+    components.push_back(token);
+  }
+  return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(components)));
+}
+
+// Returns a new ColumnPath holding this path's components followed by
+// `node_name`. The current object is not modified.
+std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
+  std::vector<std::string> extended;
+  extended.reserve(path_.size() + 1);
+  extended.insert(extended.end(), path_.cbegin(), path_.cend());
+  extended.push_back(node_name);
+
+  return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(extended)));
+}
+
+// Joins the path components into a single string, separated by '.'.
+std::string ColumnPath::ToDotString() const {
+  std::stringstream out;
+  bool first = true;
+  for (const std::string& component : path_) {
+    if (!first) { out << "."; }
+    out << component;
+    first = false;
+  }
+  return out.str();
+}
+
+// Returns a reference to the stored path components, one entry per name.
+const std::vector<std::string>& ColumnPath::ToDotVector() const {
+ return path_;
+}
+
+// ----------------------------------------------------------------------
+// Base node
+
+// Compares the attributes stored on the base Node: node kind, name,
+// repetition and logical type. The id and parent are not compared.
+bool Node::EqualsInternal(const Node* other) const {
+  if (type_ != other->type_) { return false; }
+  if (name_ != other->name_) { return false; }
+  if (repetition_ != other->repetition_) { return false; }
+  return logical_type_ == other->logical_type_;
+}
+
+// Records `parent` as this node's parent, replacing any previous value.
+void Node::SetParent(const Node* parent) {
+ parent_ = parent;
+}
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+// Builds a primitive (leaf) node and validates that `logical_type` is allowed
+// to annotate the physical `type`.
+//
+// `length` is only validated for FIXED_LEN_BYTE_ARRAY (must be > 0) and for
+// INTERVAL (must be 12). `precision` and `scale` are only consulted for
+// DECIMAL.
+//
+// Throws ParquetException when the combination is rejected.
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+    Type::type type, LogicalType::type logical_type, int length, int precision, int scale,
+    int id)
+    : Node(Node::PRIMITIVE, name, repetition, logical_type, id),
+      physical_type_(type),
+      type_length_(length) {
+  std::stringstream ss;
+
+  // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
+  // set to true, but Impala will raise an incompatible metadata error in such
+  // cases. The fields are reset one by one (rather than with memset) so this
+  // file does not rely on <cstring> being included transitively.
+  decimal_metadata_.isset = false;
+  decimal_metadata_.scale = 0;
+  decimal_metadata_.precision = 0;
+
+  // Check if the physical and logical types match
+  // Mapping referred from Apache parquet-mr as on 2016-02-22
+  switch (logical_type) {
+    case LogicalType::NONE:
+      // Logical type not set
+      break;
+    case LogicalType::UTF8:
+    case LogicalType::JSON:
+    case LogicalType::BSON:
+      if (type != Type::BYTE_ARRAY) {
+        ss << LogicalTypeToString(logical_type);
+        ss << " can only annotate BYTE_ARRAY fields";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::DECIMAL:
+      if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) &&
+          (type != Type::FIXED_LEN_BYTE_ARRAY)) {
+        ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
+        throw ParquetException(ss.str());
+      }
+      if (precision <= 0) {
+        ss << "Invalid DECIMAL precision: " << precision;
+        throw ParquetException(ss.str());
+      }
+      if (scale < 0) {
+        ss << "Invalid DECIMAL scale: " << scale;
+        throw ParquetException(ss.str());
+      }
+      if (scale > precision) {
+        ss << "Invalid DECIMAL scale " << scale;
+        ss << " cannot be greater than precision " << precision;
+        throw ParquetException(ss.str());
+      }
+      // Only a validated DECIMAL annotation marks the metadata as set
+      decimal_metadata_.isset = true;
+      decimal_metadata_.precision = precision;
+      decimal_metadata_.scale = scale;
+      break;
+    case LogicalType::DATE:
+    case LogicalType::TIME_MILLIS:
+    case LogicalType::UINT_8:
+    case LogicalType::UINT_16:
+    case LogicalType::UINT_32:
+    case LogicalType::INT_8:
+    case LogicalType::INT_16:
+    case LogicalType::INT_32:
+      if (type != Type::INT32) {
+        ss << LogicalTypeToString(logical_type);
+        ss << " can only annotate INT32";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::TIME_MICROS:
+    case LogicalType::TIMESTAMP_MILLIS:
+    case LogicalType::TIMESTAMP_MICROS:
+    case LogicalType::UINT_64:
+    case LogicalType::INT_64:
+      if (type != Type::INT64) {
+        ss << LogicalTypeToString(logical_type);
+        ss << " can only annotate INT64";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::INTERVAL:
+      if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
+        ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
+        throw ParquetException(ss.str());
+      }
+      break;
+    case LogicalType::ENUM:
+      if (type != Type::BYTE_ARRAY) {
+        ss << "ENUM can only annotate BYTE_ARRAY fields";
+        throw ParquetException(ss.str());
+      }
+      break;
+    default:
+      // Any remaining logical type is rejected for primitive nodes
+      ss << LogicalTypeToString(logical_type);
+      ss << " can not be applied to a primitive type";
+      throw ParquetException(ss.str());
+  }
+  if (type == Type::FIXED_LEN_BYTE_ARRAY) {
+    if (length <= 0) {
+      ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
+      throw ParquetException(ss.str());
+    }
+    type_length_ = length;
+  }
+}
+
+// Compares the primitive-specific attributes: physical type, logical type,
+// decimal precision/scale (DECIMAL only) and byte width
+// (FIXED_LEN_BYTE_ARRAY only).
+bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
+  if (physical_type_ != other->physical_type_) { return false; }
+  if (logical_type_ != other->logical_type_) { return false; }
+
+  if (logical_type_ == LogicalType::DECIMAL) {
+    const DecimalMetadata& lhs = decimal_metadata_;
+    const DecimalMetadata& rhs = other->decimal_metadata_;
+    if (lhs.precision != rhs.precision || lhs.scale != rhs.scale) { return false; }
+  }
+  if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY &&
+      type_length_ != other->type_length_) {
+    return false;
+  }
+  return true;
+}
+
+// Full equality check against another node.
+bool PrimitiveNode::Equals(const Node* other) const {
+  // The base comparison includes the node kind, so the downcast is only
+  // evaluated when `other` has the same kind as this node.
+  return Node::EqualsInternal(other) &&
+         EqualsInternal(static_cast<const PrimitiveNode*>(other));
+}
+
+// Hands this node to `visitor` through its mutable Visit overload.
+void PrimitiveNode::Visit(Node::Visitor* visitor) {
+ visitor->Visit(this);
+}
+
+// Hands this node to `visitor` as a const Node.
+void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
+ visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+// Two groups match when they have the same number of fields and every field
+// compares equal to the field at the same position in `other`.
+bool GroupNode::EqualsInternal(const GroupNode* other) const {
+  if (other == this) { return true; }
+
+  const int num_fields = this->field_count();
+  if (num_fields != other->field_count()) { return false; }
+
+  for (int idx = 0; idx < num_fields; ++idx) {
+    const NodePtr& mine = this->field(idx);
+    const NodePtr& theirs = other->field(idx);
+    if (!mine->Equals(theirs.get())) { return false; }
+  }
+  return true;
+}
+
+// Full equality check against another node.
+bool GroupNode::Equals(const Node* other) const {
+  // The base comparison includes the node kind, so the downcast is only
+  // evaluated when `other` has the same kind as this node.
+  return Node::EqualsInternal(other) &&
+         EqualsInternal(static_cast<const GroupNode*>(other));
+}
+
+// Hands this node to `visitor`; children are not traversed here.
+void GroupNode::Visit(Node::Visitor* visitor) {
+ visitor->Visit(this);
+}
+
+// Hands this node to `visitor` as a const Node; children are not traversed
+// here.
+void GroupNode::VisitConst(Node::ConstVisitor* visitor) const {
+ visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Node construction from Parquet metadata
+
+// Attributes shared by group and primitive nodes, gathered from a
+// SchemaElement before a node is constructed.
+struct NodeParams {
+ // Only binds `name`; repetition and logical_type are left for the caller to
+ // assign.
+ explicit NodeParams(const std::string& name) : name(name) {}
+
+ // Held by reference, not copied: the referenced string must outlive this
+ // object.
+ const std::string& name;
+ Repetition::type repetition;
+ LogicalType::type logical_type;
+};
+
+// Extracts the name, repetition and logical type of `element`. When the
+// element has no converted type set, the logical type is LogicalType::NONE.
+static inline NodeParams GetNodeParams(const format::SchemaElement* element) {
+  NodeParams result(element->name);
+  result.repetition = FromThrift(element->repetition_type);
+  result.logical_type = element->__isset.converted_type
+                            ? FromThrift(element->converted_type)
+                            : LogicalType::NONE;
+  return result;
+}
+
+// Creates a group node from a Thrift SchemaElement passed as an opaque
+// pointer, using `fields` as its children and `node_id` as its id.
+std::unique_ptr<Node> GroupNode::FromParquet(
+    const void* opaque_element, int node_id, const NodeVector& fields) {
+  const auto* schema_element =
+      static_cast<const format::SchemaElement*>(opaque_element);
+  const NodeParams attrs = GetNodeParams(schema_element);
+
+  std::unique_ptr<Node> group(new GroupNode(
+      attrs.name, attrs.repetition, fields, attrs.logical_type, node_id));
+  return group;
+}
+
+// Creates a primitive node from a Thrift SchemaElement passed as an opaque
+// pointer. The element's type_length, precision and scale are forwarded to the
+// PrimitiveNode constructor as-is.
+std::unique_ptr<Node> PrimitiveNode::FromParquet(
+    const void* opaque_element, int node_id) {
+  const auto* schema_element =
+      static_cast<const format::SchemaElement*>(opaque_element);
+  const NodeParams attrs = GetNodeParams(schema_element);
+
+  // Owned through the base type from the start
+  std::unique_ptr<Node> leaf(new PrimitiveNode(attrs.name, attrs.repetition,
+      FromThrift(schema_element->type), attrs.logical_type,
+      schema_element->type_length, schema_element->precision, schema_element->scale,
+      node_id));
+  return leaf;
+}
+
+// Fills the Thrift SchemaElement behind `opaque_element` with this group's
+// name, child count and repetition, plus the converted type when a logical
+// type is present.
+void GroupNode::ToParquet(void* opaque_element) const {
+  auto& out = *static_cast<format::SchemaElement*>(opaque_element);
+  out.__set_name(name_);
+  out.__set_num_children(field_count());
+  out.__set_repetition_type(ToThrift(repetition_));
+  if (logical_type_ == LogicalType::NONE) { return; }
+  out.__set_converted_type(ToThrift(logical_type_));
+}
+
+// Fills the Thrift SchemaElement behind `opaque_element` from this leaf node.
+// type_length is written only for FIXED_LEN_BYTE_ARRAY, and precision/scale
+// only when the decimal metadata is flagged as set.
+void PrimitiveNode::ToParquet(void* opaque_element) const {
+  auto& out = *static_cast<format::SchemaElement*>(opaque_element);
+
+  out.__set_name(name_);
+  out.__set_num_children(0);
+  out.__set_repetition_type(ToThrift(repetition_));
+  if (logical_type_ != LogicalType::NONE) {
+    out.__set_converted_type(ToThrift(logical_type_));
+  }
+
+  out.__set_type(ToThrift(physical_type_));
+  if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+    out.__set_type_length(type_length_);
+  }
+
+  const DecimalMetadata& decimal = decimal_metadata_;
+  if (decimal.isset) {
+    out.__set_precision(decimal.precision);
+    out.__set_scale(decimal.scale);
+  }
+}
+
+// ----------------------------------------------------------------------
+// Schema converters
+
+// Converts the flat SchemaElement list into a node tree and returns its root.
+//
+// Throws ParquetException when the list is empty, when the root element has
+// no children, or (via NextNode) when the list ends before every declared
+// child has been read.
+std::unique_ptr<Node> FlatSchemaConverter::Convert() {
+  // elements_[0] is read below, so an empty element list must be rejected
+  // before touching it.
+  if (length_ == 0) {
+    throw ParquetException("Malformed schema: not enough SchemaElement values");
+  }
+
+  const SchemaElement& root = elements_[0];
+
+  // Validate the root node
+  if (root.num_children == 0) {
+    throw ParquetException("Root node did not have children");
+  }
+
+  // Relaxing this restriction as some implementations don't set this
+  // if (root.repetition_type != FieldRepetitionType::REPEATED) {
+  //   throw ParquetException("Root node was not FieldRepetitionType::REPEATED");
+  // }
+
+  return NextNode();
+}
+
+// Consumes the next SchemaElement and, recursively, the elements of all of its
+// descendants, returning the node built from them.
+std::unique_ptr<Node> FlatSchemaConverter::NextNode() {
+  const SchemaElement& current = Next();
+  const int id = next_id();
+  const void* opaque = static_cast<const void*>(&current);
+
+  // An element without children is a leaf (primitive) node
+  if (current.num_children == 0) { return PrimitiveNode::FromParquet(opaque, id); }
+
+  // Otherwise it is a group whose children follow it in the flat list
+  NodeVector children;
+  const int child_count = current.num_children;
+  for (int child = 0; child < child_count; ++child) {
+    children.push_back(NodePtr(NextNode().release()));
+  }
+  return GroupNode::FromParquet(opaque, id, children);
+}
+
+// Returns the element at the current position and advances past it. Throws
+// ParquetException once every element has been consumed.
+const format::SchemaElement& FlatSchemaConverter::Next() {
+  if (pos_ != length_) { return elements_[pos_++]; }
+  throw ParquetException("Malformed schema: not enough SchemaElement values");
+}
+
+// Builds a SchemaDescriptor from a flat list of Thrift SchemaElements.
+//
+// Throws ParquetException when `schema` is empty or when the conversion
+// rejects the element list.
+std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
+  // Indexing element 0 of an empty vector is undefined behaviour, so reject
+  // empty input up front.
+  if (schema.empty()) {
+    throw ParquetException("Malformed schema: no SchemaElement values");
+  }
+
+  FlatSchemaConverter converter(schema.data(), schema.size());
+  std::unique_ptr<Node> root = converter.Convert();
+
+  // Convert() rejects a root element without children, and NextNode() builds a
+  // group for every element that has children, so the root is a GroupNode.
+  std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
+  descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
+
+  return descr;
+}
+
+// Flattens the tree rooted at `schema` into `out` using a SchemaFlattener.
+// Elements are appended to `out`; it is not cleared first.
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
+ SchemaFlattener flattener(schema, out);
+ flattener.Flatten();
+}
+
+// Visitor that appends one SchemaElement per visited node to the target
+// vector: a node first, then its children in field order.
+class SchemaVisitor : public Node::ConstVisitor {
+ public:
+  explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
+      : elements_(elements) {}
+  virtual ~SchemaVisitor() {}
+
+  void Visit(const Node* node) override {
+    format::SchemaElement flattened;
+    node->ToParquet(&flattened);
+    elements_->push_back(flattened);
+
+    // Leaves have nothing further to emit
+    if (!node->is_group()) { return; }
+
+    const auto* group = static_cast<const GroupNode*>(node);
+    const int child_count = group->field_count();
+    for (int child = 0; child < child_count; ++child) {
+      group->field(child)->VisitConst(this);
+    }
+  }
+
+ private:
+  // Output vector supplied by the caller; this class does not own it
+  std::vector<format::SchemaElement>* elements_;
+};
+
+// Stores the root group to flatten and the vector that receives the elements.
+// Both pointers are kept as-is; nothing is copied.
+SchemaFlattener::SchemaFlattener(
+ const GroupNode* schema, std::vector<format::SchemaElement>* out)
+ : root_(schema), elements_(out) {}
+
+// Walks the tree from the root with a SchemaVisitor, which appends one
+// SchemaElement per node to the output vector.
+void SchemaFlattener::Flatten() {
+ SchemaVisitor visitor(elements_);
+ root_->VisitConst(&visitor);
+}
+
+// ----------------------------------------------------------------------
+// Schema printing
+
+// Visitor that writes a textual rendering of a schema tree to a stream,
+// indenting each nesting level by `indent_width` spaces.
+class SchemaPrinter : public Node::ConstVisitor {
+ public:
+  // `indent_width` is honoured here; it was previously ignored in favour of a
+  // hard-coded 2.
+  explicit SchemaPrinter(std::ostream& stream, int indent_width)
+      : stream_(stream), indent_(0), indent_width_(indent_width) {}
+
+  // Entry point: writes the current indentation, then dispatches on node kind
+  void Visit(const Node* node) override;
+
+ private:
+  void Visit(const PrimitiveNode* node);
+  void Visit(const GroupNode* node);
+
+  // Writes `indent_` spaces to the stream
+  void Indent();
+
+  std::ostream& stream_;
+
+  // Current indentation in spaces
+  int indent_;
+  // Spaces added per nesting level
+  int indent_width_;
+};
+
+// Writes the lower-case keyword for `repetition`; nothing is written for a
+// value outside the three handled cases.
+static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
+  const char* keyword = nullptr;
+  switch (repetition) {
+    case Repetition::REQUIRED:
+      keyword = "required";
+      break;
+    case Repetition::OPTIONAL:
+      keyword = "optional";
+      break;
+    case Repetition::REPEATED:
+      keyword = "repeated";
+      break;
+    default:
+      break;
+  }
+  if (keyword != nullptr) { stream << keyword; }
+}
+
+// Writes the keyword for the node's physical type. FIXED_LEN_BYTE_ARRAY also
+// carries its byte width in parentheses. Unhandled values write nothing.
+static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
+  const Type::type physical = node->physical_type();
+
+  // The only type whose rendering needs more than a fixed keyword
+  if (physical == Type::FIXED_LEN_BYTE_ARRAY) {
+    stream << "fixed_len_byte_array(" << node->type_length() << ")";
+    return;
+  }
+
+  const char* keyword = nullptr;
+  switch (physical) {
+    case Type::BOOLEAN:
+      keyword = "boolean";
+      break;
+    case Type::INT32:
+      keyword = "int32";
+      break;
+    case Type::INT64:
+      keyword = "int64";
+      break;
+    case Type::INT96:
+      keyword = "int96";
+      break;
+    case Type::FLOAT:
+      keyword = "float";
+      break;
+    case Type::DOUBLE:
+      keyword = "double";
+      break;
+    case Type::BYTE_ARRAY:
+      keyword = "binary";
+      break;
+    default:
+      break;
+  }
+  if (keyword != nullptr) { stream << keyword; }
+}
+
+// Writes " (<LOGICAL_TYPE>)" after a field, or nothing when the node has no
+// logical type. DECIMAL additionally shows "(precision,scale)".
+static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) {
+  const LogicalType::type annotation = node->logical_type();
+  if (annotation == LogicalType::NONE) { return; }
+
+  stream << " (" << LogicalTypeToString(annotation);
+  if (annotation == LogicalType::DECIMAL) {
+    const DecimalMetadata& decimal = node->decimal_metadata();
+    stream << "(" << decimal.precision << "," << decimal.scale << ")";
+  }
+  stream << ")";
+}
+
+// Prints one leaf field as "<repetition> <type> <name>[ (<logical type>)];"
+// followed by a newline. Indentation is written by the caller.
+void SchemaPrinter::Visit(const PrimitiveNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " ";
+ PrintType(node, stream_);
+ stream_ << " " << node->name();
+ PrintLogicalType(node, stream_);
+ stream_ << ";" << std::endl;
+}
+
+// Prints a group header, its fields one indentation level deeper, and the
+// closing brace. A group without a parent is printed as "message <name> {".
+void SchemaPrinter::Visit(const GroupNode* node) {
+  const bool is_root = (node->parent() == nullptr);
+  if (is_root) {
+    stream_ << "message " << node->name() << " {" << std::endl;
+  } else {
+    PrintRepLevel(node->repetition(), stream_);
+    stream_ << " group " << node->name();
+    const LogicalType::type annotation = node->logical_type();
+    if (annotation != LogicalType::NONE) {
+      stream_ << " (" << LogicalTypeToString(annotation) << ")";
+    }
+    stream_ << " {" << std::endl;
+  }
+
+  // Children are printed one level deeper; the level is restored before the
+  // closing brace so it lines up with the header.
+  indent_ += indent_width_;
+  const int num_fields = node->field_count();
+  for (int idx = 0; idx < num_fields; ++idx) {
+    node->field(idx)->VisitConst(this);
+  }
+  indent_ -= indent_width_;
+
+  Indent();
+  stream_ << "}" << std::endl;
+}
+
+// Writes `indent_` spaces to the stream; does nothing at level zero or below.
+void SchemaPrinter::Indent() {
+  if (indent_ <= 0) { return; }
+  stream_ << std::string(indent_, ' ');
+}
+
+// Visitor entry point: writes the current indentation, then forwards to the
+// overload that matches the node kind.
+void SchemaPrinter::Visit(const Node* node) {
+  Indent();
+  if (!node->is_group()) {
+    // Primitive
+    Visit(static_cast<const PrimitiveNode*>(node));
+    return;
+  }
+  Visit(static_cast<const GroupNode*>(node));
+}
+
+// Prints the tree rooted at `schema` to `stream`. `indent_width` is forwarded
+// to the SchemaPrinter constructor.
+void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
+ SchemaPrinter printer(stream, indent_width);
+ printer.Visit(schema);
+}
+
+} // namespace schema
+
+using schema::ColumnPath;
+using schema::Node;
+using schema::NodePtr;
+using schema::PrimitiveNode;
+using schema::GroupNode;
+
+// Takes ownership of `schema` by moving it into a shared NodePtr, then
+// delegates to the NodePtr overload.
+void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
+ Init(NodePtr(schema.release()));
+}
+
+// Adopts `schema` as the schema root and rebuilds the per-leaf analysis
+// (column descriptors and the leaf-to-base mapping).
+//
+// Throws ParquetException when `schema` is null or is not a group node; in
+// that case the descriptor is left unchanged.
+void SchemaDescriptor::Init(const NodePtr& schema) {
+  // Validate before mutating any member so a failed call cannot leave schema_
+  // and group_node_ referring to different trees.
+  if (!schema || !schema->is_group()) {
+    throw ParquetException("Must initialize with a schema group");
+  }
+
+  schema_ = schema;
+  group_node_ = static_cast<const GroupNode*>(schema_.get());
+
+  // Drop everything computed by a previous Init(). leaf_to_base_ is filled
+  // with emplace(), which keeps an existing entry for a key, so stale entries
+  // would otherwise survive a re-initialization.
+  leaves_.clear();
+  leaf_to_base_.clear();
+
+  // Each direct child of the root is the "base" of every leaf beneath it
+  for (int i = 0; i < group_node_->field_count(); ++i) {
+    BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
+  }
+}
+
+// Two descriptors are equal when they have the same number of columns and
+// every column descriptor equals the one at the same index in `other`.
+bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
+  const int column_count = this->num_columns();
+  if (column_count != other.num_columns()) { return false; }
+
+  for (int col = 0; col < column_count; ++col) {
+    const ColumnDescriptor* lhs = this->Column(col);
+    const ColumnDescriptor* rhs = other.Column(col);
+    if (!lhs->Equals(*rhs)) { return false; }
+  }
+
+  return true;
+}
+
+// Recursively walks `node`, accumulating the maximum definition and
+// repetition levels, and records a ColumnDescriptor for every leaf reached.
+// `base` is the child of the schema root under which the walk started.
+void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
+    int16_t max_rep_level, const NodePtr& base) {
+  if (node->is_optional()) {
+    ++max_def_level;
+  } else if (node->is_repeated()) {
+    // Repeated fields add a definition level. This is used to distinguish
+    // between an empty list and a list with an item in it.
+    ++max_rep_level;
+    ++max_def_level;
+  }
+
+  if (!node->is_group()) {
+    // Primitive node, append to leaves and remember which base it belongs to
+    leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
+    leaf_to_base_.emplace(leaves_.size() - 1, base);
+    return;
+  }
+
+  // Group node: descend into every field with the levels accumulated so far
+  const GroupNode* group = static_cast<const GroupNode*>(node.get());
+  const int num_fields = group->field_count();
+  for (int idx = 0; idx < num_fields; ++idx) {
+    BuildTree(group->field(idx), max_def_level, max_rep_level, base);
+  }
+}
+
+// Wraps a primitive schema node together with its maximum definition and
+// repetition levels and a pointer to the owning SchemaDescriptor.
+//
+// Throws ParquetException when `node` is not a primitive node.
+ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
+ int16_t max_definition_level, int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr)
+ : node_(node),
+ max_definition_level_(max_definition_level),
+ max_repetition_level_(max_repetition_level),
+ schema_descr_(schema_descr) {
+ if (!node_->is_primitive()) { throw ParquetException("Must be a primitive type"); }
+ // Cache the downcast pointer; node_ keeps the node alive
+ primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
+}
+
+// Columns are equal when their primitive nodes compare equal and both level
+// maxima match.
+bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
+  if (!primitive_node_->Equals(other.primitive_node_)) { return false; }
+  if (max_repetition_level() != other.max_repetition_level()) { return false; }
+  return max_definition_level() == other.max_definition_level();
+}
+
+// Returns a pointer to the descriptor of leaf column `i`. The index is only
+// checked through DCHECK.
+const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return &leaves_[i];
+}
+
+// Returns the base node recorded for leaf column `i` in leaf_to_base_.
+// NOTE(review): the result of find() is dereferenced without comparing it to
+// end(); the only guard is the DCHECK on the index.
+const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return leaf_to_base_.find(i)->second;
+}
+
+// Returns the scale stored in the primitive node's decimal metadata.
+int ColumnDescriptor::type_scale() const {
+ return primitive_node_->decimal_metadata().scale;
+}
+
+// Returns the precision stored in the primitive node's decimal metadata.
+int ColumnDescriptor::type_precision() const {
+ return primitive_node_->decimal_metadata().precision;
+}
+
+// Returns the type length stored on the primitive node.
+int ColumnDescriptor::type_length() const {
+ return primitive_node_->type_length();
+}
+
+// Builds the column's path by following parent links from the leaf upwards.
+// The topmost node (the one without a parent) is not part of the path.
+const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
+  // Names are collected leaf-first while climbing
+  std::vector<std::string> leaf_to_root;
+  for (const Node* cursor = primitive_node_; cursor->parent();
+       cursor = cursor->parent()) {
+    leaf_to_root.push_back(cursor->name());
+  }
+
+  // Reverse into root-first order for the ColumnPath
+  std::vector<std::string> root_to_leaf(leaf_to_root.crbegin(), leaf_to_root.crend());
+  return std::make_shared<ColumnPath>(std::move(root_to_leaf));
+}
+
+} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema.h b/src/parquet/schema.h
new file mode 100644
index 0000000..30aea44
--- /dev/null
+++ b/src/parquet/schema.h
@@ -0,0 +1,405 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#ifndef PARQUET_SCHEMA_TYPES_H
+#define PARQUET_SCHEMA_TYPES_H
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "parquet/types.h"
+#include "parquet/util/macros.h"
+#include "parquet/util/visibility.h"
+
+namespace parquet {
+
+class SchemaDescriptor;
+
+namespace schema {
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+// repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+// <required/optional> group list
+// repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+// <required/optional> group bag
+// repeated group list
+// <required/optional> value_type item
+//
+// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
+// the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+// Scoped holder for the list encoding styles; refer to the values as
+// ListEncoding::ONE_LEVEL, ListEncoding::TWO_LEVEL and
+// ListEncoding::THREE_LEVEL.
+struct ListEncoding {
+ enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
+};
+
+// Precision and scale carried by a primitive node, with a flag telling
+// whether they have been set.
+struct DecimalMetadata {
+ // True when scale and precision hold meaningful values
+ bool isset;
+ int32_t scale;
+ int32_t precision;
+};
+
+// An ordered list of node names identifying a column within a schema.
+class PARQUET_EXPORT ColumnPath {
+ public:
+  ColumnPath() : path_() {}
+  explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+  // `path` is a named rvalue reference and therefore an lvalue inside the
+  // constructor; without std::move the vector would be copied, not moved.
+  explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+
+  // Splits `dotstring` on '.' into path components
+  static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+
+  // Returns a new path made of this path's components plus `node_name`
+  std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+  // Joins the components with '.'
+  std::string ToDotString() const;
+  // Returns the components as a vector
+  const std::vector<std::string>& ToDotVector() const;
+
+ protected:
+  std::vector<std::string> path_;
+};
+
+class GroupNode;
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class PARQUET_EXPORT Node {
+ public:
+ // Discriminates the two concrete node kinds
+ enum type { PRIMITIVE, GROUP };
+
+ // A freshly constructed node has no parent
+ Node(Node::type type, const std::string& name, Repetition::type repetition,
+ LogicalType::type logical_type = LogicalType::NONE, int id = -1)
+ : type_(type),
+ name_(name),
+ repetition_(repetition),
+ logical_type_(logical_type),
+ id_(id),
+ parent_(nullptr) {}
+
+ virtual ~Node() {}
+
+ bool is_primitive() const { return type_ == Node::PRIMITIVE; }
+
+ bool is_group() const { return type_ == Node::GROUP; }
+
+ bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
+
+ bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
+
+ bool is_required() const { return repetition_ == Repetition::REQUIRED; }
+
+ // Equality as defined by the concrete node class
+ virtual bool Equals(const Node* other) const = 0;
+
+ const std::string& name() const { return name_; }
+
+ Node::type node_type() const { return type_; }
+
+ Repetition::type repetition() const { return repetition_; }
+
+ LogicalType::type logical_type() const { return logical_type_; }
+
+ int id() const { return id_; }
+
+ // nullptr until a GroupNode adopts this node as one of its fields
+ const Node* parent() const { return parent_; }
+
+ // ToParquet writes into an opaque void* to avoid exporting
+ // parquet::SchemaElement into the public API
+ virtual void ToParquet(void* opaque_element) const = 0;
+
+ // Node::Visitor abstract class for walking schemas with the visitor pattern
+ class Visitor {
+ public:
+ virtual ~Visitor() {}
+
+ virtual void Visit(Node* node) = 0;
+ };
+ // Read-only counterpart of Visitor
+ class ConstVisitor {
+ public:
+ virtual ~ConstVisitor() {}
+
+ virtual void Visit(const Node* node) = 0;
+ };
+
+ virtual void Visit(Visitor* visitor) = 0;
+ virtual void VisitConst(ConstVisitor* visitor) const = 0;
+
+ protected:
+ // GroupNode calls SetParent() on its fields
+ friend class GroupNode;
+
+ Node::type type_;
+ std::string name_;
+ Repetition::type repetition_;
+ LogicalType::type logical_type_;
+ int id_;
+ // Nodes should not be shared, they have a single parent.
+ const Node* parent_;
+
+ // Compares node kind, name, repetition and logical type
+ bool EqualsInternal(const Node* other) const;
+ void SetParent(const Node* p_parent);
+};
+
+// Save our breath all over the place with these typedefs
+typedef std::shared_ptr<Node> NodePtr;
+typedef std::vector<NodePtr> NodeVector;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), also has the
+// physical storage type and their type-specific metadata (byte width, decimal
+// parameters)
+class PARQUET_EXPORT PrimitiveNode : public Node {
+ public:
+ // FromParquet accepts an opaque void* to avoid exporting
+ // parquet::SchemaElement into the public API
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element, int id);
+
+ // Factory for a leaf node; the constructor is private. The node id is left
+ // at the constructor's default.
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ Type::type type, LogicalType::type logical_type = LogicalType::NONE,
+ int length = -1, int precision = -1, int scale = -1) {
+ return NodePtr(new PrimitiveNode(
+ name, repetition, type, logical_type, length, precision, scale));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ Type::type physical_type() const { return physical_type_; }
+
+ int32_t type_length() const { return type_length_; }
+
+ const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
+
+ void ToParquet(void* opaque_element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+ // Validates the physical/logical type combination; see the definition in
+ // schema.cc for the accepted pairs
+ PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
+ LogicalType::type logical_type = LogicalType::NONE, int length = -1,
+ int precision = -1, int scale = -1, int id = -1);
+
+ Type::type physical_type_;
+ int32_t type_length_;
+ DecimalMetadata decimal_metadata_;
+
+ // For FIXED_LEN_BYTE_ARRAY
+ void SetTypeLength(int32_t length) { type_length_ = length; }
+
+ // For Decimal logical type: Precision and scale
+ // NOTE(review): the argument order is (scale, precision) and the isset flag
+ // is not touched here.
+ void SetDecimalMetadata(int32_t scale, int32_t precision) {
+ decimal_metadata_.scale = scale;
+ decimal_metadata_.precision = precision;
+ }
+
+ bool EqualsInternal(const PrimitiveNode* other) const;
+
+ FRIEND_TEST(TestPrimitiveNode, Attrs);
+ FRIEND_TEST(TestPrimitiveNode, Equals);
+ FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
+ FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
+
+// A node that contains an ordered list of child nodes (fields).
+class PARQUET_EXPORT GroupNode : public Node {
+ public:
+ // Like PrimitiveNode, GroupNode::FromParquet accepts an opaque void* to avoid exporting
+ // parquet::SchemaElement into the public API
+ static std::unique_ptr<Node> FromParquet(
+ const void* opaque_element, int id, const NodeVector& fields);
+
+ // Factory for a group node; the constructor is private. The node id is left
+ // at the constructor's default.
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE) {
+ return NodePtr(new GroupNode(name, repetition, fields, logical_type));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ // No bounds check is performed on `i`
+ const NodePtr& field(int i) const { return fields_[i]; }
+
+ int field_count() const { return fields_.size(); }
+
+ void ToParquet(void* opaque_element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+ // Copies the field pointers and makes this group the parent of every field,
+ // overwriting whatever parent a field had before
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, LogicalType::type logical_type = LogicalType::NONE,
+ int id = -1)
+ : Node(Node::GROUP, name, repetition, logical_type, id), fields_(fields) {
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ }
+ }
+
+ NodeVector fields_;
+ bool EqualsInternal(const GroupNode* other) const;
+
+ FRIEND_TEST(TestGroupNode, Attrs);
+ FRIEND_TEST(TestGroupNode, Equals);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+// Defines a helper FuncName(name, repetition) that creates a PrimitiveNode of
+// physical type Type::TYPE with no logical type. Repetition defaults to
+// OPTIONAL.
+#define PRIMITIVE_FACTORY(FuncName, TYPE) \
+ static inline NodePtr FuncName( \
+ const std::string& name, Repetition::type repetition = Repetition::OPTIONAL) { \
+ return PrimitiveNode::Make(name, repetition, Type::TYPE); \
+ }
+
+// One factory per physical type that needs no extra parameters
+PRIMITIVE_FACTORY(Boolean, BOOLEAN);
+PRIMITIVE_FACTORY(Int32, INT32);
+PRIMITIVE_FACTORY(Int64, INT64);
+PRIMITIVE_FACTORY(Int96, INT96);
+PRIMITIVE_FACTORY(Float, FLOAT);
+PRIMITIVE_FACTORY(Double, DOUBLE);
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY);
+
+void PARQUET_EXPORT PrintSchema(
+ const schema::Node* schema, std::ostream& stream, int indent_width = 2);
+
+} // namespace schema
+
+// The ColumnDescriptor encapsulates information necessary to interpret
+// primitive column data in the context of a particular schema. We have to
+// examine the node structure of a column's path to the root in the schema tree
+// to be able to reassemble the nested structure from the repetition and
+// definition levels.
+class PARQUET_EXPORT ColumnDescriptor {
+ public:
+ // Throws ParquetException when `node` is not a primitive node
+ ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
+ int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
+
+ // Compares the primitive nodes and both level maxima
+ bool Equals(const ColumnDescriptor& other) const;
+
+ int16_t max_definition_level() const { return max_definition_level_; }
+
+ int16_t max_repetition_level() const { return max_repetition_level_; }
+
+ Type::type physical_type() const { return primitive_node_->physical_type(); }
+
+ LogicalType::type logical_type() const { return primitive_node_->logical_type(); }
+
+ const std::string& name() const { return primitive_node_->name(); }
+
+ // Path of node names from below the schema root down to this leaf
+ const std::shared_ptr<schema::ColumnPath> path() const;
+
+ const schema::NodePtr& schema_node() const { return node_; }
+
+ int type_length() const;
+
+ int type_precision() const;
+
+ int type_scale() const;
+
+ private:
+ // Shared ownership of the node; primitive_node_ is the same object downcast
+ schema::NodePtr node_;
+ const schema::PrimitiveNode* primitive_node_;
+
+ int16_t max_definition_level_;
+ int16_t max_repetition_level_;
+
+ // When this descriptor is part of a real schema (and not being used for
+ // testing purposes), maintain a link back to the parent SchemaDescriptor to
+ // enable reverse graph traversals
+ const SchemaDescriptor* schema_descr_;
+};
+
+// Container for the converted Parquet schema, together with the information
+// computed by the schema analysis that is needed for file reading
+//
+// * Column index to Node
+// * Max repetition / definition levels for each primitive node
+//
+// The ColumnDescriptor objects produced by this class can be used to assist in
+// the reconstruction of fully materialized data structures from the
+// repetition-definition level encoding of nested data
+//
+// TODO(wesm): this object can be recomputed from a Schema
+class PARQUET_EXPORT SchemaDescriptor {
+ public:
+  // group_node_ starts out null so an un-initialized descriptor never holds an
+  // indeterminate pointer; Init() must be called before the schema is used.
+  SchemaDescriptor() : group_node_(nullptr) {}
+  ~SchemaDescriptor() {}
+
+  // Analyze the schema
+  void Init(std::unique_ptr<schema::Node> schema);
+  void Init(const schema::NodePtr& schema);
+
+  // Descriptor of leaf column `i`
+  const ColumnDescriptor* Column(int i) const;
+
+  // Column-by-column comparison with `other`
+  bool Equals(const SchemaDescriptor& other) const;
+
+  // The number of physical columns appearing in the file
+  int num_columns() const { return leaves_.size(); }
+
+  const schema::NodePtr& schema_root() const { return schema_; }
+
+  const schema::GroupNode* group_node() const { return group_node_; }
+
+  // Returns the root (child of the schema root) node of the leaf(column) node
+  const schema::NodePtr& GetColumnRoot(int i) const;
+
+  // Name of the root group; requires a prior successful Init()
+  const std::string& name() const { return group_node_->name(); }
+
+ private:
+  friend class ColumnDescriptor;
+
+  schema::NodePtr schema_;
+  // Same object as schema_, downcast to GroupNode
+  const schema::GroupNode* group_node_;
+
+  void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
+      int16_t max_rep_level, const schema::NodePtr& base);
+
+  // Result of leaf node / tree analysis
+  std::vector<ColumnDescriptor> leaves_;
+
+  // Mapping between leaf nodes and root group of leaf (first node
+  // below the schema's root group)
+  //
+  // For example, the leaf `a.b.c.d` would have a link back to `a`
+  //
+  // -- a  <------
+  // -- -- b     |
+  // -- -- -- c  |
+  // -- -- -- -- d
+  std::unordered_map<int, const schema::NodePtr> leaf_to_base_;
+};
+
+} // namespace parquet
+
+#endif // PARQUET_SCHEMA_TYPES_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/schema/CMakeLists.txt b/src/parquet/schema/CMakeLists.txt
deleted file mode 100644
index 8aa9969..0000000
--- a/src/parquet/schema/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Headers: top level
-install(FILES
- descriptor.h
- printer.h
- types.h
- DESTINATION include/parquet/schema)
-
-ADD_PARQUET_TEST(schema-converter-test)
-ADD_PARQUET_TEST(schema-descriptor-test)
-ADD_PARQUET_TEST(schema-printer-test)
-ADD_PARQUET_TEST(schema-types-test)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/converter.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.cc b/src/parquet/schema/converter.cc
deleted file mode 100644
index 3b18af3..0000000
--- a/src/parquet/schema/converter.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema/converter.h"
-
-#include "parquet/exception.h"
-#include "parquet/schema/descriptor.h"
-#include "parquet/schema/types.h"
-#include "parquet/thrift/parquet_types.h"
-
-using parquet::format::SchemaElement;
-
-namespace parquet {
-
-namespace schema {
-
-std::unique_ptr<Node> FlatSchemaConverter::Convert() {
- const SchemaElement& root = elements_[0];
-
- // Validate the root node
- if (root.num_children == 0) {
- throw ParquetException("Root node did not have children");
- }
-
- // Relaxing this restriction as some implementations don't set this
- // if (root.repetition_type != FieldRepetitionType::REPEATED) {
- // throw ParquetException("Root node was not FieldRepetitionType::REPEATED");
- // }
-
- return NextNode();
-}
-
-std::unique_ptr<Node> FlatSchemaConverter::NextNode() {
- const SchemaElement& element = Next();
-
- int node_id = next_id();
-
- const void* opaque_element = static_cast<const void*>(&element);
-
- if (element.num_children == 0) {
- // Leaf (primitive) node
- return PrimitiveNode::FromParquet(opaque_element, node_id);
- } else {
- // Group
- NodeVector fields;
- for (int i = 0; i < element.num_children; ++i) {
- std::unique_ptr<Node> field = NextNode();
- fields.push_back(NodePtr(field.release()));
- }
- return GroupNode::FromParquet(opaque_element, node_id, fields);
- }
-}
-
-const format::SchemaElement& FlatSchemaConverter::Next() {
- if (pos_ == length_) {
- throw ParquetException("Malformed schema: not enough SchemaElement values");
- }
- return elements_[pos_++];
-}
-
-std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
- FlatSchemaConverter converter(&schema[0], schema.size());
- std::unique_ptr<Node> root = converter.Convert();
-
- std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
- descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
-
- return descr;
-}
-
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
- SchemaFlattener flattener(schema, out);
- flattener.Flatten();
-}
-
-class SchemaVisitor : public Node::ConstVisitor {
- public:
- explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
- : elements_(elements) {}
- virtual ~SchemaVisitor() {}
-
- void Visit(const Node* node) override {
- format::SchemaElement element;
- node->ToParquet(&element);
- elements_->push_back(element);
-
- if (node->is_group()) {
- const GroupNode* group_node = static_cast<const GroupNode*>(node);
- for (int i = 0; i < group_node->field_count(); ++i) {
- group_node->field(i)->VisitConst(this);
- }
- }
- }
-
- private:
- std::vector<format::SchemaElement>* elements_;
-};
-
-SchemaFlattener::SchemaFlattener(
- const GroupNode* schema, std::vector<format::SchemaElement>* out)
- : root_(schema), elements_(out) {}
-
-void SchemaFlattener::Flatten() {
- SchemaVisitor visitor(elements_);
- root_->VisitConst(&visitor);
-}
-
-} // namespace schema
-
-} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/converter.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/converter.h b/src/parquet/schema/converter.h
deleted file mode 100644
index 617d985..0000000
--- a/src/parquet/schema/converter.h
+++ /dev/null
@@ -1,91 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Conversion routines for converting to and from flat Parquet metadata. Among
-// other things, this limits the exposure of the internals of the Thrift
-// metadata structs to the rest of the library.
-
-// NB: This file is not part of the schema public API and only used internally
-// for converting to and from Parquet Thrift metadata
-
-#ifndef PARQUET_SCHEMA_CONVERTER_H
-#define PARQUET_SCHEMA_CONVERTER_H
-
-#include <memory>
-#include <vector>
-
-namespace parquet {
-
-namespace format {
-class SchemaElement;
-}
-
-class SchemaDescriptor;
-
-namespace schema {
-
-class GroupNode;
-class Node;
-
-// ----------------------------------------------------------------------
-// Conversion from Parquet Thrift metadata
-
-std::shared_ptr<SchemaDescriptor> FromParquet(
- const std::vector<format::SchemaElement>& schema);
-
-class FlatSchemaConverter {
- public:
- FlatSchemaConverter(const format::SchemaElement* elements, int length)
- : elements_(elements), length_(length), pos_(0), current_id_(0) {}
-
- std::unique_ptr<Node> Convert();
-
- private:
- const format::SchemaElement* elements_;
- int length_;
- int pos_;
- int current_id_;
-
- int next_id() { return current_id_++; }
-
- const format::SchemaElement& Next();
-
- std::unique_ptr<Node> NextNode();
-};
-
-// ----------------------------------------------------------------------
-// Conversion to Parquet Thrift metadata
-
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
-
-// Converts nested parquet schema back to a flat vector of Thrift structs
-class SchemaFlattener {
- public:
- SchemaFlattener(const GroupNode* schema, std::vector<format::SchemaElement>* out);
-
- void Flatten();
-
- private:
- const GroupNode* root_;
- std::vector<format::SchemaElement>* elements_;
-};
-
-} // namespace schema
-
-} // namespace parquet
-
-#endif // PARQUET_SCHEMA_CONVERTER_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
deleted file mode 100644
index 0b0d006..0000000
--- a/src/parquet/schema/descriptor.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema/descriptor.h"
-
-#include "parquet/exception.h"
-#include "parquet/util/logging.h"
-
-namespace parquet {
-
-using schema::ColumnPath;
-using schema::Node;
-using schema::NodePtr;
-using schema::PrimitiveNode;
-using schema::GroupNode;
-
-void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
- Init(NodePtr(schema.release()));
-}
-
-void SchemaDescriptor::Init(const NodePtr& schema) {
- schema_ = schema;
-
- if (!schema_->is_group()) {
- throw ParquetException("Must initialize with a schema group");
- }
-
- group_node_ = static_cast<const GroupNode*>(schema_.get());
- leaves_.clear();
-
- for (int i = 0; i < group_node_->field_count(); ++i) {
- BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
- }
-}
-
-bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
- if (this->num_columns() != other.num_columns()) { return false; }
-
- for (int i = 0; i < this->num_columns(); ++i) {
- if (!this->Column(i)->Equals(*other.Column(i))) { return false; }
- }
-
- return true;
-}
-
-void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
- int16_t max_rep_level, const NodePtr& base) {
- if (node->is_optional()) {
- ++max_def_level;
- } else if (node->is_repeated()) {
- // Repeated fields add a definition level. This is used to distinguish
- // between an empty list and a list with an item in it.
- ++max_rep_level;
- ++max_def_level;
- }
-
- // Now, walk the schema and create a ColumnDescriptor for each leaf node
- if (node->is_group()) {
- const GroupNode* group = static_cast<const GroupNode*>(node.get());
- for (int i = 0; i < group->field_count(); ++i) {
- BuildTree(group->field(i), max_def_level, max_rep_level, base);
- }
- } else {
- // Primitive node, append to leaves
- leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
- leaf_to_base_.emplace(leaves_.size() - 1, base);
- }
-}
-
-ColumnDescriptor::ColumnDescriptor(const schema::NodePtr& node,
- int16_t max_definition_level, int16_t max_repetition_level,
- const SchemaDescriptor* schema_descr)
- : node_(node),
- max_definition_level_(max_definition_level),
- max_repetition_level_(max_repetition_level),
- schema_descr_(schema_descr) {
- if (!node_->is_primitive()) { throw ParquetException("Must be a primitive type"); }
- primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
-}
-
-bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
- return primitive_node_->Equals(other.primitive_node_) &&
- max_repetition_level() == other.max_repetition_level() &&
- max_definition_level() == other.max_definition_level();
-}
-
-const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
- DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
- return &leaves_[i];
-}
-
-const schema::NodePtr& SchemaDescriptor::GetColumnRoot(int i) const {
- DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
- return leaf_to_base_.find(i)->second;
-}
-
-int ColumnDescriptor::type_scale() const {
- return primitive_node_->decimal_metadata().scale;
-}
-
-int ColumnDescriptor::type_precision() const {
- return primitive_node_->decimal_metadata().precision;
-}
-
-int ColumnDescriptor::type_length() const {
- return primitive_node_->type_length();
-}
-
-const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
- // Build the path in reverse order as we traverse the nodes to the top
- std::vector<std::string> rpath_;
- const Node* node = primitive_node_;
- // The schema node is not part of the ColumnPath
- while (node->parent()) {
- rpath_.push_back(node->name());
- node = node->parent();
- }
-
- // Build ColumnPath in correct order
- std::vector<std::string> path_(rpath_.crbegin(), rpath_.crend());
- return std::make_shared<ColumnPath>(std::move(path_));
-}
-
-} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
deleted file mode 100644
index ae7b60e..0000000
--- a/src/parquet/schema/descriptor.h
+++ /dev/null
@@ -1,142 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef PARQUET_SCHEMA_DESCRIPTOR_H
-#define PARQUET_SCHEMA_DESCRIPTOR_H
-
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-#include "parquet/util/visibility.h"
-#include <cstdint>
-#include <cstdlib>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace parquet {
-
-class SchemaDescriptor;
-
-// The ColumnDescriptor encapsulates information necessary to interpret
-// primitive column data in the context of a particular schema. We have to
-// examine the node structure of a column's path to the root in the schema tree
-// to be able to reassemble the nested structure from the repetition and
-// definition levels.
-class PARQUET_EXPORT ColumnDescriptor {
- public:
- ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
- int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
-
- bool Equals(const ColumnDescriptor& other) const;
-
- int16_t max_definition_level() const { return max_definition_level_; }
-
- int16_t max_repetition_level() const { return max_repetition_level_; }
-
- Type::type physical_type() const { return primitive_node_->physical_type(); }
-
- LogicalType::type logical_type() const { return primitive_node_->logical_type(); }
-
- const std::string& name() const { return primitive_node_->name(); }
-
- const std::shared_ptr<schema::ColumnPath> path() const;
-
- const schema::NodePtr& schema_node() const { return node_; }
-
- int type_length() const;
-
- int type_precision() const;
-
- int type_scale() const;
-
- private:
- schema::NodePtr node_;
- const schema::PrimitiveNode* primitive_node_;
-
- int16_t max_definition_level_;
- int16_t max_repetition_level_;
-
- // When this descriptor is part of a real schema (and not being used for
- // testing purposes), maintain a link back to the parent SchemaDescriptor to
- // enable reverse graph traversals
- const SchemaDescriptor* schema_descr_;
-};
-
-// Container for the converted Parquet schema with a computed information from
-// the schema analysis needed for file reading
-//
-// * Column index to Node
-// * Max repetition / definition levels for each primitive node
-//
-// The ColumnDescriptor objects produced by this class can be used to assist in
-// the reconstruction of fully materialized data structures from the
-// repetition-definition level encoding of nested data
-//
-// TODO(wesm): this object can be recomputed from a Schema
-class PARQUET_EXPORT SchemaDescriptor {
- public:
- SchemaDescriptor() {}
- ~SchemaDescriptor() {}
-
- // Analyze the schema
- void Init(std::unique_ptr<schema::Node> schema);
- void Init(const schema::NodePtr& schema);
-
- const ColumnDescriptor* Column(int i) const;
-
- bool Equals(const SchemaDescriptor& other) const;
-
- // The number of physical columns appearing in the file
- int num_columns() const { return leaves_.size(); }
-
- const schema::NodePtr& schema_root() const { return schema_; }
-
- const schema::GroupNode* group_node() const { return group_node_; }
-
- // Returns the root (child of the schema root) node of the leaf(column) node
- const schema::NodePtr& GetColumnRoot(int i) const;
-
- const std::string& name() const { return group_node_->name(); }
-
- private:
- friend class ColumnDescriptor;
-
- schema::NodePtr schema_;
- const schema::GroupNode* group_node_;
-
- void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
- int16_t max_rep_level, const schema::NodePtr& base);
-
- // Result of leaf node / tree analysis
- std::vector<ColumnDescriptor> leaves_;
-
- // Mapping between leaf nodes and root group of leaf (first node
- // below the schema's root group)
- //
- // For example, the leaf `a.b.c.d` would have a link back to `a`
- //
- // -- a <------
- // -- -- b |
- // -- -- -- c |
- // -- -- -- -- d
- std::unordered_map<int, const schema::NodePtr> leaf_to_base_;
-};
-
-} // namespace parquet
-
-#endif // PARQUET_SCHEMA_DESCRIPTOR_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc
deleted file mode 100644
index ca11244..0000000
--- a/src/parquet/schema/printer.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema/printer.h"
-
-#include <memory>
-#include <string>
-
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-namespace schema {
-
-class SchemaPrinter : public Node::ConstVisitor {
- public:
- explicit SchemaPrinter(std::ostream& stream, int indent_width)
- : stream_(stream), indent_(0), indent_width_(2) {}
-
- void Visit(const Node* node) override;
-
- private:
- void Visit(const PrimitiveNode* node);
- void Visit(const GroupNode* node);
-
- void Indent();
-
- std::ostream& stream_;
-
- int indent_;
- int indent_width_;
-};
-
-static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
- switch (repetition) {
- case Repetition::REQUIRED:
- stream << "required";
- break;
- case Repetition::OPTIONAL:
- stream << "optional";
- break;
- case Repetition::REPEATED:
- stream << "repeated";
- break;
- default:
- break;
- }
-}
-
-static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
- switch (node->physical_type()) {
- case Type::BOOLEAN:
- stream << "boolean";
- break;
- case Type::INT32:
- stream << "int32";
- break;
- case Type::INT64:
- stream << "int64";
- break;
- case Type::INT96:
- stream << "int96";
- break;
- case Type::FLOAT:
- stream << "float";
- break;
- case Type::DOUBLE:
- stream << "double";
- break;
- case Type::BYTE_ARRAY:
- stream << "binary";
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- stream << "fixed_len_byte_array(" << node->type_length() << ")";
- break;
- default:
- break;
- }
-}
-
-static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) {
- auto lt = node->logical_type();
- if (lt == LogicalType::DECIMAL) {
- stream << " (" << LogicalTypeToString(lt) << "(" << node->decimal_metadata().precision
- << "," << node->decimal_metadata().scale << "))";
- } else if (lt != LogicalType::NONE) {
- stream << " (" << LogicalTypeToString(lt) << ")";
- }
-}
-
-void SchemaPrinter::Visit(const PrimitiveNode* node) {
- PrintRepLevel(node->repetition(), stream_);
- stream_ << " ";
- PrintType(node, stream_);
- stream_ << " " << node->name();
- PrintLogicalType(node, stream_);
- stream_ << ";" << std::endl;
-}
-
-void SchemaPrinter::Visit(const GroupNode* node) {
- if (!node->parent()) {
- stream_ << "message " << node->name() << " {" << std::endl;
- } else {
- PrintRepLevel(node->repetition(), stream_);
- stream_ << " group " << node->name();
- auto lt = node->logical_type();
- if (lt != LogicalType::NONE) { stream_ << " (" << LogicalTypeToString(lt) << ")"; }
- stream_ << " {" << std::endl;
- }
-
- indent_ += indent_width_;
- for (int i = 0; i < node->field_count(); ++i) {
- node->field(i)->VisitConst(this);
- }
- indent_ -= indent_width_;
- Indent();
- stream_ << "}" << std::endl;
-}
-
-void SchemaPrinter::Indent() {
- if (indent_ > 0) {
- std::string spaces(indent_, ' ');
- stream_ << spaces;
- }
-}
-
-void SchemaPrinter::Visit(const Node* node) {
- Indent();
- if (node->is_group()) {
- Visit(static_cast<const GroupNode*>(node));
- } else {
- // Primitive
- Visit(static_cast<const PrimitiveNode*>(node));
- }
-}
-
-void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
- SchemaPrinter printer(stream, indent_width);
- printer.Visit(schema);
-}
-
-} // namespace schema
-
-} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/printer.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.h b/src/parquet/schema/printer.h
deleted file mode 100644
index c37ef90..0000000
--- a/src/parquet/schema/printer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// A simple Schema printer using the visitor pattern
-
-#ifndef PARQUET_SCHEMA_PRINTER_H
-#define PARQUET_SCHEMA_PRINTER_H
-
-#include <ostream>
-
-#include "parquet/util/visibility.h"
-
-namespace parquet {
-
-namespace schema {
-
-class Node;
-
-void PARQUET_EXPORT PrintSchema(
- const Node* schema, std::ostream& stream, int indent_width = 2);
-
-} // namespace schema
-
-} // namespace parquet
-
-#endif // PARQUET_SCHEMA_PRINTER_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-converter-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc
deleted file mode 100644
index c752919..0000000
--- a/src/parquet/schema/schema-converter-test.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <gtest/gtest.h>
-
-#include <cstdlib>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/exception.h"
-#include "parquet/schema/converter.h"
-#include "parquet/schema/test-util.h"
-#include "parquet/schema/types.h"
-#include "parquet/thrift/parquet_types.h"
-#include "parquet/types.h"
-
-using std::string;
-using std::vector;
-
-using parquet::format::ConvertedType;
-using parquet::format::FieldRepetitionType;
-using parquet::format::SchemaElement;
-
-namespace parquet {
-
-namespace schema {
-
-// ----------------------------------------------------------------------
-// Test convert group
-
-class TestSchemaConverter : public ::testing::Test {
- public:
- void setUp() { name_ = "parquet_schema"; }
-
- void Convert(const parquet::format::SchemaElement* elements, int length) {
- FlatSchemaConverter converter(elements, length);
- node_ = converter.Convert();
- ASSERT_TRUE(node_->is_group());
- group_ = static_cast<const GroupNode*>(node_.get());
- }
-
- protected:
- std::string name_;
- const GroupNode* group_;
- std::unique_ptr<Node> node_;
-};
-
-bool check_for_parent_consistency(const GroupNode* node) {
- // Each node should have the group as parent
- for (int i = 0; i < node->field_count(); i++) {
- const NodePtr& field = node->field(i);
- if (field->parent() != node) { return false; }
- if (field->is_group()) {
- const GroupNode* group = static_cast<GroupNode*>(field.get());
- if (!check_for_parent_consistency(group)) { return false; }
- }
- }
- return true;
-}
-
-TEST_F(TestSchemaConverter, NestedExample) {
- SchemaElement elt;
- std::vector<SchemaElement> elements;
- elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
-
- // A primitive one
- elements.push_back(
- NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1));
-
- // A group
- elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
-
- // 3-level list encoding, by hand
- elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
- elt.__set_converted_type(ConvertedType::LIST);
- elements.push_back(elt);
- elements.push_back(
- NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4));
-
- Convert(&elements[0], elements.size());
-
- // Construct the expected schema
- NodeVector fields;
- fields.push_back(Int32("a", Repetition::REQUIRED));
-
- // 3-level list encoding
- NodePtr item = Int64("item");
- NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
- NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
- fields.push_back(bag);
-
- NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
-
- ASSERT_TRUE(schema->Equals(group_));
-
- // Check that the parent relationship in each node is consitent
- ASSERT_EQ(group_->parent(), nullptr);
- ASSERT_TRUE(check_for_parent_consistency(group_));
-}
-
-TEST_F(TestSchemaConverter, InvalidRoot) {
- // According to the Parquet specification, the first element in the
- // list<SchemaElement> is a group whose children (and their descendants)
- // contain all of the rest of the flattened schema elements. If the first
- // element is not a group, it is a malformed Parquet file.
-
- SchemaElement elements[2];
- elements[0] =
- NewPrimitive("not-a-group", FieldRepetitionType::REQUIRED, format::Type::INT32, 0);
- ASSERT_THROW(Convert(elements, 2), ParquetException);
-
- // While the Parquet spec indicates that the root group should have REPEATED
- // repetition type, some implementations may return REQUIRED or OPTIONAL
- // groups as the first element. These tests check that this is okay as a
- // practicality matter.
- elements[0] = NewGroup("not-repeated", FieldRepetitionType::REQUIRED, 1, 0);
- elements[1] = NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1);
- Convert(elements, 2);
-
- elements[0] = NewGroup("not-repeated", FieldRepetitionType::OPTIONAL, 1, 0);
- Convert(elements, 2);
-}
-
-TEST_F(TestSchemaConverter, NotEnoughChildren) {
- // Throw a ParquetException, but don't core dump or anything
- SchemaElement elt;
- std::vector<SchemaElement> elements;
- elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
- ASSERT_THROW(Convert(&elements[0], 1), ParquetException);
-}
-
-// ----------------------------------------------------------------------
-// Schema tree flatten / unflatten
-
-class TestSchemaFlatten : public ::testing::Test {
- public:
- void setUp() { name_ = "parquet_schema"; }
-
- void Flatten(const GroupNode* schema) { ToParquet(schema, &elements_); }
-
- protected:
- std::string name_;
- std::vector<format::SchemaElement> elements_;
-};
-
-TEST_F(TestSchemaFlatten, DecimalMetadata) {
- // Checks that DecimalMetadata is only set for DecimalTypes
- NodePtr node = PrimitiveNode::Make(
- "decimal", Repetition::REQUIRED, Type::INT64, LogicalType::DECIMAL, -1, 8, 4);
- NodePtr group =
- GroupNode::Make("group", Repetition::REPEATED, {node}, LogicalType::LIST);
- Flatten(reinterpret_cast<GroupNode*>(group.get()));
- ASSERT_EQ("decimal", elements_[1].name);
- ASSERT_TRUE(elements_[1].__isset.precision);
- ASSERT_TRUE(elements_[1].__isset.scale);
-
- elements_.clear();
- // Not for integers with no logical type
- group =
- GroupNode::Make("group", Repetition::REPEATED, {Int64("int64")}, LogicalType::LIST);
- Flatten(reinterpret_cast<GroupNode*>(group.get()));
- ASSERT_EQ("int64", elements_[1].name);
- ASSERT_FALSE(elements_[0].__isset.precision);
- ASSERT_FALSE(elements_[0].__isset.scale);
-}
-
-TEST_F(TestSchemaFlatten, NestedExample) {
- SchemaElement elt;
- std::vector<SchemaElement> elements;
- elements.push_back(NewGroup(name_, FieldRepetitionType::REPEATED, 2, 0));
-
- // A primitive one
- elements.push_back(
- NewPrimitive("a", FieldRepetitionType::REQUIRED, format::Type::INT32, 1));
-
- // A group
- elements.push_back(NewGroup("bag", FieldRepetitionType::OPTIONAL, 1, 2));
-
- // 3-level list encoding, by hand
- elt = NewGroup("b", FieldRepetitionType::REPEATED, 1, 3);
- elt.__set_converted_type(ConvertedType::LIST);
- elements.push_back(elt);
- elements.push_back(
- NewPrimitive("item", FieldRepetitionType::OPTIONAL, format::Type::INT64, 4));
-
- // Construct the schema
- NodeVector fields;
- fields.push_back(Int32("a", Repetition::REQUIRED));
-
- // 3-level list encoding
- NodePtr item = Int64("item");
- NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
- NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
- fields.push_back(bag);
-
- NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
-
- Flatten(static_cast<GroupNode*>(schema.get()));
- ASSERT_EQ(elements_.size(), elements.size());
- for (size_t i = 0; i < elements_.size(); i++) {
- ASSERT_EQ(elements_[i], elements[i]);
- }
-}
-
-} // namespace schema
-
-} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
deleted file mode 100644
index 4b7f67c..0000000
--- a/src/parquet/schema/schema-descriptor-test.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Schema / column descriptor correctness tests (from flat Parquet schemas)
-
-#include <cstdint>
-#include <cstdlib>
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "parquet/exception.h"
-#include "parquet/schema/descriptor.h"
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-
-using std::string;
-using std::vector;
-
-namespace parquet {
-
-namespace schema {
-
-TEST(TestColumnDescriptor, TestAttrs) {
- NodePtr node = PrimitiveNode::Make(
- "name", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::UTF8);
- ColumnDescriptor descr(node, 4, 1);
-
- ASSERT_EQ("name", descr.name());
- ASSERT_EQ(4, descr.max_definition_level());
- ASSERT_EQ(1, descr.max_repetition_level());
-
- ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type());
-
- ASSERT_EQ(-1, descr.type_length());
-
- // Test FIXED_LEN_BYTE_ARRAY
- node = PrimitiveNode::Make("name", Repetition::OPTIONAL, Type::FIXED_LEN_BYTE_ARRAY,
- LogicalType::DECIMAL, 12, 10, 4);
- descr = ColumnDescriptor(node, 4, 1);
-
- ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type());
- ASSERT_EQ(12, descr.type_length());
-}
-
-class TestSchemaDescriptor : public ::testing::Test {
- public:
- void setUp() {}
-
- protected:
- SchemaDescriptor descr_;
-};
-
-TEST_F(TestSchemaDescriptor, InitNonGroup) {
- NodePtr node = PrimitiveNode::Make("field", Repetition::OPTIONAL, Type::INT32);
-
- ASSERT_THROW(descr_.Init(node), ParquetException);
-}
-
-TEST_F(TestSchemaDescriptor, Equals) {
- NodePtr schema;
-
- NodePtr inta = Int32("a", Repetition::REQUIRED);
- NodePtr intb = Int64("b", Repetition::OPTIONAL);
- NodePtr intb2 = Int64("b2", Repetition::OPTIONAL);
- NodePtr intc = ByteArray("c", Repetition::REPEATED);
-
- NodePtr item1 = Int64("item1", Repetition::REQUIRED);
- NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
- NodePtr item3 = Int32("item3", Repetition::REPEATED);
- NodePtr list(GroupNode::Make(
- "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST));
-
- NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
- NodePtr bag2(GroupNode::Make("bag", Repetition::REQUIRED, {list}));
-
- SchemaDescriptor descr1;
- descr1.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag}));
-
- ASSERT_TRUE(descr1.Equals(descr1));
-
- SchemaDescriptor descr2;
- descr2.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag2}));
- ASSERT_FALSE(descr1.Equals(descr2));
-
- SchemaDescriptor descr3;
- descr3.Init(GroupNode::Make("schema", Repetition::REPEATED, {inta, intb2, intc, bag}));
- ASSERT_FALSE(descr1.Equals(descr3));
-
- // Robust to name of parent node
- SchemaDescriptor descr4;
- descr4.Init(GroupNode::Make("SCHEMA", Repetition::REPEATED, {inta, intb, intc, bag}));
- ASSERT_TRUE(descr1.Equals(descr4));
-
- SchemaDescriptor descr5;
- descr5.Init(
- GroupNode::Make("schema", Repetition::REPEATED, {inta, intb, intc, bag, intb2}));
- ASSERT_FALSE(descr1.Equals(descr5));
-
- // Different max repetition / definition levels
- ColumnDescriptor col1(inta, 5, 1);
- ColumnDescriptor col2(inta, 6, 1);
- ColumnDescriptor col3(inta, 5, 2);
-
- ASSERT_TRUE(col1.Equals(col1));
- ASSERT_FALSE(col1.Equals(col2));
- ASSERT_FALSE(col1.Equals(col3));
-}
-
-TEST_F(TestSchemaDescriptor, BuildTree) {
- NodeVector fields;
- NodePtr schema;
-
- NodePtr inta = Int32("a", Repetition::REQUIRED);
- fields.push_back(inta);
- fields.push_back(Int64("b", Repetition::OPTIONAL));
- fields.push_back(ByteArray("c", Repetition::REPEATED));
-
- // 3-level list encoding
- NodePtr item1 = Int64("item1", Repetition::REQUIRED);
- NodePtr item2 = Boolean("item2", Repetition::OPTIONAL);
- NodePtr item3 = Int32("item3", Repetition::REPEATED);
- NodePtr list(GroupNode::Make(
- "records", Repetition::REPEATED, {item1, item2, item3}, LogicalType::LIST));
- NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
- fields.push_back(bag);
-
- schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
-
- descr_.Init(schema);
-
- int nleaves = 6;
-
- // 6 leaves
- ASSERT_EQ(nleaves, descr_.num_columns());
-
- // mdef mrep
- // required int32 a 0 0
- // optional int64 b 1 0
- // repeated byte_array c 1 1
- // optional group bag 1 0
- // repeated group records 2 1
- // required int64 item1 2 1
- // optional boolean item2 3 1
- // repeated int32 item3 3 2
- int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3};
- int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2};
-
- for (int i = 0; i < nleaves; ++i) {
- const ColumnDescriptor* col = descr_.Column(i);
- EXPECT_EQ(ex_max_def_levels[i], col->max_definition_level()) << i;
- EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i;
- }
-
- ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a");
- ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b");
- ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c");
- ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1");
- ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2");
- ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3");
-
- ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0).get());
- ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3).get());
- ASSERT_EQ(bag.get(), descr_.GetColumnRoot(4).get());
- ASSERT_EQ(bag.get(), descr_.GetColumnRoot(5).get());
-
- ASSERT_EQ(schema.get(), descr_.group_node());
-
- // Init clears the leaves
- descr_.Init(schema);
- ASSERT_EQ(nleaves, descr_.num_columns());
-}
-
-} // namespace schema
-
-} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/13da51d3/src/parquet/schema/schema-printer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-printer-test.cc b/src/parquet/schema/schema-printer-test.cc
deleted file mode 100644
index 29140f0..0000000
--- a/src/parquet/schema/schema-printer-test.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <gtest/gtest.h>
-
-#include <iosfwd>
-#include <string>
-#include <vector>
-
-#include "parquet/schema/printer.h"
-#include "parquet/schema/types.h"
-#include "parquet/types.h"
-
-using std::string;
-using std::vector;
-
-namespace parquet {
-
-namespace schema {
-
-static std::string Print(const NodePtr& node) {
- std::stringstream ss;
- PrintSchema(node.get(), ss);
- return ss.str();
-}
-
-TEST(TestSchemaPrinter, Examples) {
- // Test schema 1
- NodeVector fields;
- fields.push_back(Int32("a", Repetition::REQUIRED));
-
- // 3-level list encoding
- NodePtr item1 = Int64("item1");
- NodePtr item2 = Boolean("item2", Repetition::REQUIRED);
- NodePtr list(
- GroupNode::Make("b", Repetition::REPEATED, {item1, item2}, LogicalType::LIST));
- NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
- fields.push_back(bag);
-
- fields.push_back(PrimitiveNode::Make(
- "c", Repetition::REQUIRED, Type::INT32, LogicalType::DECIMAL, -1, 3, 2));
-
- NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
-
- std::string result = Print(schema);
- std::string expected = R"(message schema {
- required int32 a;
- optional group bag {
- repeated group b (LIST) {
- optional int64 item1;
- required boolean item2;
- }
- }
- required int32 c (DECIMAL(3,2));
-}
-)";
- ASSERT_EQ(expected, result);
-}
-
-} // namespace schema
-
-} // namespace parquet