You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/03/27 20:31:54 UTC
parquet-cpp git commit: PARQUET-566: Add method to retrieve the full
column path
Repository: parquet-cpp
Updated Branches:
refs/heads/master fbb25bf3e -> f785e4c7b
PARQUET-566: Add method to retrieve the full column path
This is based on the idea of `org.apache.parquet.hadoop.metadata.ColumnPath`.
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #82 from xhochy/parquet-566 and squashes the following commits:
dd48b01 [Uwe L. Korn] Move friend declaration into protected section
43f51d1 [Uwe L. Korn] Construct ColumnPath only on request
bb30ab3 [Uwe L. Korn] Initialise parent with nullptr
98cf302 [Uwe L. Korn] Add parent node reference
799f553 [Uwe L. Korn] Deactivate C++11 lint checks
032de01 [Uwe L. Korn] Use stringstream for readability
ad887a9 [Uwe L. Korn] Adhere to Google naming conventions
ec3c008 [Uwe L. Korn] PARQUET-566: Add method to retrieve the full column path
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/f785e4c7
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/f785e4c7
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/f785e4c7
Branch: refs/heads/master
Commit: f785e4c7b46e5fa21da5a49051285ae2264e3758
Parents: fbb25bf
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Sun Mar 27 11:31:44 2016 -0700
Committer: Wes McKinney <we...@apache.org>
Committed: Sun Mar 27 11:31:44 2016 -0700
----------------------------------------------------------------------
CMakeLists.txt | 2 +-
src/parquet/schema/descriptor.cc | 17 ++++++++++
src/parquet/schema/descriptor.h | 5 ++-
src/parquet/schema/schema-converter-test.cc | 21 ++++++++++++
src/parquet/schema/schema-descriptor-test.cc | 9 +++++-
src/parquet/schema/schema-types-test.cc | 15 +++++++++
src/parquet/schema/types.cc | 39 +++++++++++++++++++++++
src/parquet/schema/types.h | 35 ++++++++++++++++++--
8 files changed, 138 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 661d813..f3207c0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -276,7 +276,7 @@ if (UNIX)
add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py
--verbose=2
--linelength=90
- --filter=-whitespace/comments,-readability/todo,-build/header_guard,-runtime/references,-readability/check
+ --filter=-whitespace/comments,-readability/todo,-build/header_guard,-runtime/references,-readability/check,-build/c++11
`find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`)
endif (UNIX)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
index 1246f84..21c1cd5 100644
--- a/src/parquet/schema/descriptor.cc
+++ b/src/parquet/schema/descriptor.cc
@@ -21,6 +21,8 @@
namespace parquet_cpp {
+using schema::ColumnPath;
+using schema::Node;
using schema::NodePtr;
using schema::PrimitiveNode;
using schema::GroupNode;
@@ -96,4 +98,19 @@ int ColumnDescriptor::type_length() const {
return primitive_node_->type_length();
}
+// Returns the names of all nodes from just below the schema root down to this
+// column's primitive node, wrapped in a newly allocated ColumnPath.
+const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
+  // Walk upwards from the leaf, so names are collected leaf-first. A node
+  // without a parent is the schema root, whose name is deliberately left out.
+  std::vector<std::string> leaf_to_root;
+  for (const Node* cursor = primitive_node_; cursor->parent();
+       cursor = cursor->parent()) {
+    leaf_to_root.push_back(cursor->name());
+  }
+
+  // Flip into root-to-leaf order before handing the names to ColumnPath.
+  std::vector<std::string> root_to_leaf(leaf_to_root.crbegin(), leaf_to_root.crend());
+  return std::make_shared<ColumnPath>(std::move(root_to_leaf));
+}
+
} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
index 3fad182..836aefe 100644
--- a/src/parquet/schema/descriptor.h
+++ b/src/parquet/schema/descriptor.h
@@ -39,7 +39,8 @@ class SchemaDescriptor;
class ColumnDescriptor {
public:
ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
- int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
+ int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr = nullptr);
int16_t max_definition_level() const {
return max_definition_level_;
@@ -61,6 +62,8 @@ class ColumnDescriptor {
return primitive_node_->name();
}
+ const std::shared_ptr<schema::ColumnPath> path() const;
+
int type_length() const;
int type_precision() const;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/schema-converter-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc
index f749b40..1ae9d00 100644
--- a/src/parquet/schema/schema-converter-test.cc
+++ b/src/parquet/schema/schema-converter-test.cc
@@ -62,6 +62,23 @@ class TestSchemaConverter : public ::testing::Test {
std::unique_ptr<Node> node_;
};
+// Recursively verifies that every field below `node` reports the group that
+// directly contains it as its parent. Returns false at the first mismatch.
+bool check_for_parent_consistency(const GroupNode* node) {
+  for (int idx = 0; idx < node->field_count(); ++idx) {
+    const NodePtr& child = node->field(idx);
+    if (child->parent() != node) { return false; }
+    // Only groups have fields of their own to descend into.
+    const bool subtree_ok = !child->is_group() ||
+        check_for_parent_consistency(static_cast<GroupNode*>(child.get()));
+    if (!subtree_ok) { return false; }
+  }
+  return true;
+}
+
TEST_F(TestSchemaConverter, NestedExample) {
SchemaElement elt;
std::vector<SchemaElement> elements;
@@ -96,6 +113,10 @@ TEST_F(TestSchemaConverter, NestedExample) {
NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
ASSERT_TRUE(schema->Equals(group_));
+
+ // Check that the parent relationship in each node is consistent
+ ASSERT_EQ(group_->parent(), nullptr);
+ ASSERT_TRUE(check_for_parent_consistency(group_));
}
TEST_F(TestSchemaConverter, InvalidRoot) {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
index eda33a9..6daf577 100644
--- a/src/parquet/schema/schema-descriptor-test.cc
+++ b/src/parquet/schema/schema-descriptor-test.cc
@@ -106,7 +106,7 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
// optional group bag 1 0
// repeated group records 2 1
// required int64 item1 2 1
- // optional boolean item1 3 1
+ // optional boolean item2 3 1
// repeated int32 item3 3 2
int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3};
int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2};
@@ -117,6 +117,13 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i;
}
+ ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a");
+ ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b");
+ ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c");
+ ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1");
+ ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2");
+ ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3");
+
// Init clears the leaves
descr_.Init(schema);
ASSERT_EQ(nleaves, descr_.num_columns());
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/schema-types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc
index fa4718a..d58c2e7 100644
--- a/src/parquet/schema/schema-types-test.cc
+++ b/src/parquet/schema/schema-types-test.cc
@@ -35,6 +35,21 @@ namespace parquet_cpp {
namespace schema {
// ----------------------------------------------------------------------
+// ColumnPath
+
+TEST(TestColumnPath, TestAttrs) {  // Covers construction, dot-string parsing/formatting and extend()
+ ColumnPath path(std::vector<std::string>({"toplevel", "leaf"}));  // built from a temporary vector
+
+ ASSERT_EQ(path.ToDotString(), "toplevel.leaf");
+
+ std::shared_ptr<ColumnPath> path_ptr = ColumnPath::FromDotString("toplevel.leaf");
+ ASSERT_EQ(path_ptr->ToDotString(), "toplevel.leaf");  // parse followed by format round-trips
+
+ std::shared_ptr<ColumnPath> extended = path_ptr->extend("anotherlevel");  // extend() is const and returns a new path
+ ASSERT_EQ(extended->ToDotString(), "toplevel.leaf.anotherlevel");
+}
+
+// ----------------------------------------------------------------------
// Primitive node
class TestPrimitiveNode : public ::testing::Test {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
index b57fd37..a6be222 100644
--- a/src/parquet/schema/types.cc
+++ b/src/parquet/schema/types.cc
@@ -17,6 +17,7 @@
#include "parquet/schema/types.h"
+#include <algorithm>
#include <memory>
#include "parquet/exception.h"
@@ -28,6 +29,40 @@ namespace parquet_cpp {
namespace schema {
// ----------------------------------------------------------------------
+// ColumnPath
+
+// Splits `dotstring` on '.' and returns the pieces, in order, as a new
+// ColumnPath; e.g. "a.b" yields the path {"a", "b"}. An empty input yields an
+// empty path because std::getline extracts nothing from an empty stream.
+std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
+  std::stringstream ss(dotstring);
+  std::string item;
+  std::vector<std::string> path;
+  while (std::getline(ss, item, '.')) {
+    path.push_back(item);
+  }
+  // make_shared avoids a naked new and matches how ColumnDescriptor::path()
+  // creates its result.
+  return std::make_shared<ColumnPath>(std::move(path));
+}
+
+// Returns a new ColumnPath holding this path's components followed by
+// `node_name`. This object is left unchanged.
+std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
+  std::vector<std::string> path;
+  // Reserve the final size once, then copy-construct each element in place
+  // rather than default-constructing the strings and overwriting them.
+  path.reserve(path_.size() + 1);
+  path.insert(path.end(), path_.cbegin(), path_.cend());
+  path.push_back(node_name);
+
+  return std::make_shared<ColumnPath>(std::move(path));
+}
+
+// Renders the path as its components joined by '.', e.g. {"a", "b"} -> "a.b".
+// An empty path renders as the empty string.
+std::string ColumnPath::ToDotString() const {
+  std::stringstream joined;
+  const char* delimiter = "";
+  for (const std::string& component : path_) {
+    joined << delimiter << component;
+    // Every component after the first is preceded by a dot.
+    delimiter = ".";
+  }
+  return joined.str();
+}
+
+// ----------------------------------------------------------------------
// Base node
bool Node::EqualsInternal(const Node* other) const {
@@ -37,6 +72,10 @@ bool Node::EqualsInternal(const Node* other) const {
logical_type_ == other->logical_type_;
}
+void Node::SetParent(const Node* parent) {  // Records `parent` as this node's parent.
+ parent_ = parent;  // Plain pointer assignment; any previous value is overwritten.
+}
+
// ----------------------------------------------------------------------
// Primitive node
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h
index 2eaee8b..0972ac6 100644
--- a/src/parquet/schema/types.h
+++ b/src/parquet/schema/types.h
@@ -78,6 +78,23 @@ struct DecimalMetadata {
int32_t precision;
};
+// An ordered list of node names identifying a column inside a schema, from the
+// outermost name down to the leaf ("toplevel.leaf" in dot notation).
+class ColumnPath {
+ public:
+  ColumnPath() : path_() {}
+  explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+  // `path` is a named rvalue reference and therefore an lvalue inside the
+  // constructor; without std::move the vector would be copied, not moved.
+  explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+
+  // Parses a '.'-separated string into a new path.
+  static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+
+  // Returns a new path equal to this one with `node_name` appended.
+  std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+  // Joins the components with '.' into a single string.
+  std::string ToDotString() const;
+
+ protected:
+  std::vector<std::string> path_;
+};
+
+class GroupNode;
+
// Base class for logical schema types. A type has a name, repetition level,
// and optionally a logical type (ConvertedType in Parquet metadata parlance)
class Node {
@@ -95,7 +112,8 @@ class Node {
name_(name),
repetition_(repetition),
logical_type_(logical_type),
- id_(id) {}
+ id_(id),
+ parent_(nullptr) {}
virtual ~Node() {}
@@ -141,6 +159,10 @@ class Node {
return id_;
}
+ const Node* parent() const {  // Returns the node most recently passed to SetParent().
+ return parent_;  // nullptr when no parent was ever set (the constructor's initial value).
+ }
+
// Node::Visitor abstract class for walking schemas with the visitor pattern
class Visitor {
public:
@@ -152,13 +174,18 @@ class Node {
virtual void Visit(Visitor* visitor) = 0;
protected:
+ friend class GroupNode;
+
Node::type type_;
std::string name_;
Repetition::type repetition_;
LogicalType::type logical_type_;
int id_;
+ // Nodes should not be shared, they have a single parent.
+ const Node* parent_;
bool EqualsInternal(const Node* other) const;
+ void SetParent(const Node* p_parent);
};
// Save our breath all over the place with these typedefs
@@ -259,7 +286,11 @@ class GroupNode : public Node {
LogicalType::type logical_type = LogicalType::NONE,
int id = -1) :
Node(Node::GROUP, name, repetition, logical_type, id),
- fields_(fields) {}
+ fields_(fields) {
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ }
+ }
NodeVector fields_;
bool EqualsInternal(const GroupNode* other) const;