You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/03/27 20:31:54 UTC

parquet-cpp git commit: PARQUET-566: Add method to retrieve the full column path

Repository: parquet-cpp
Updated Branches:
  refs/heads/master fbb25bf3e -> f785e4c7b


PARQUET-566: Add method to retrieve the full column path

This is based on the idea of `org.apache.parquet.hadoop.metadata.ColumnPath`.

Author: Uwe L. Korn <uw...@xhochy.com>

Closes #82 from xhochy/parquet-566 and squashes the following commits:

dd48b01 [Uwe L. Korn] Move friend declaration into protected section
43f51d1 [Uwe L. Korn] Construct ColumnPath only on request
bb30ab3 [Uwe L. Korn] Initialise parent with nullptr
98cf302 [Uwe L. Korn] Add parent node reference
799f553 [Uwe L. Korn] Deactivate C++11 lint checks
032de01 [Uwe L. Korn] Use stringstream for readability
ad887a9 [Uwe L. Korn] Adhere to Google naming conventions
ec3c008 [Uwe L. Korn] PARQUET-566: Add method to retrieve the full column path


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/f785e4c7
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/f785e4c7
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/f785e4c7

Branch: refs/heads/master
Commit: f785e4c7b46e5fa21da5a49051285ae2264e3758
Parents: fbb25bf
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Sun Mar 27 11:31:44 2016 -0700
Committer: Wes McKinney <we...@apache.org>
Committed: Sun Mar 27 11:31:44 2016 -0700

----------------------------------------------------------------------
 CMakeLists.txt                               |  2 +-
 src/parquet/schema/descriptor.cc             | 17 ++++++++++
 src/parquet/schema/descriptor.h              |  5 ++-
 src/parquet/schema/schema-converter-test.cc  | 21 ++++++++++++
 src/parquet/schema/schema-descriptor-test.cc |  9 +++++-
 src/parquet/schema/schema-types-test.cc      | 15 +++++++++
 src/parquet/schema/types.cc                  | 39 +++++++++++++++++++++++
 src/parquet/schema/types.h                   | 35 ++++++++++++++++++--
 8 files changed, 138 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 661d813..f3207c0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -276,7 +276,7 @@ if (UNIX)
   add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py
   --verbose=2
   --linelength=90
-  --filter=-whitespace/comments,-readability/todo,-build/header_guard,-runtime/references,-readability/check
+  --filter=-whitespace/comments,-readability/todo,-build/header_guard,-runtime/references,-readability/check,-build/c++11
     `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`)
 endif (UNIX)
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
index 1246f84..21c1cd5 100644
--- a/src/parquet/schema/descriptor.cc
+++ b/src/parquet/schema/descriptor.cc
@@ -21,6 +21,8 @@
 
 namespace parquet_cpp {
 
+using schema::ColumnPath;
+using schema::Node;
 using schema::NodePtr;
 using schema::PrimitiveNode;
 using schema::GroupNode;
@@ -96,4 +98,19 @@ int ColumnDescriptor::type_length() const {
   return primitive_node_->type_length();
 }
 
+// Returns the names of this column's node and of its ancestors, ordered from
+// the outermost ancestor down to the column itself.
+const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
+  // Walking leaf -> root collects the names innermost first.
+  std::vector<std::string> reversed;
+  // A node without a parent is the schema node, which is not part of the path.
+  for (const Node* cur = primitive_node_; cur->parent(); cur = cur->parent()) {
+    reversed.push_back(cur->name());
+  }
+
+  // Flip into root -> leaf order before wrapping the names in a ColumnPath.
+  std::vector<std::string> ordered(reversed.crbegin(), reversed.crend());
+  return std::make_shared<ColumnPath>(std::move(ordered));
+}
+
 } // namespace parquet_cpp

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
index 3fad182..836aefe 100644
--- a/src/parquet/schema/descriptor.h
+++ b/src/parquet/schema/descriptor.h
@@ -39,7 +39,8 @@ class SchemaDescriptor;
 class ColumnDescriptor {
  public:
   ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level,
-      int16_t max_repetition_level, const SchemaDescriptor* schema_descr = nullptr);
+      int16_t max_repetition_level,
+      const SchemaDescriptor* schema_descr = nullptr);
 
   int16_t max_definition_level() const {
     return max_definition_level_;
@@ -61,6 +62,8 @@ class ColumnDescriptor {
     return primitive_node_->name();
   }
 
+  const std::shared_ptr<schema::ColumnPath> path() const;
+
   int type_length() const;
 
   int type_precision() const;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/schema-converter-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-converter-test.cc b/src/parquet/schema/schema-converter-test.cc
index f749b40..1ae9d00 100644
--- a/src/parquet/schema/schema-converter-test.cc
+++ b/src/parquet/schema/schema-converter-test.cc
@@ -62,6 +62,23 @@ class TestSchemaConverter : public ::testing::Test {
   std::unique_ptr<Node> node_;
 };
 
+// Returns true iff every descendant's parent() is the group that contains it.
+bool check_for_parent_consistency(const GroupNode* node) {
+  for (int idx = 0; idx < node->field_count(); ++idx) {
+    const NodePtr& child = node->field(idx);
+    if (child->parent() != node) {
+      return false;
+    }
+    if (!child->is_group()) continue;
+    // Recurse so the whole subtree is checked, not just the first level.
+    const GroupNode* subgroup = static_cast<GroupNode*>(child.get());
+    if (!check_for_parent_consistency(subgroup)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 TEST_F(TestSchemaConverter, NestedExample) {
   SchemaElement elt;
   std::vector<SchemaElement> elements;
@@ -96,6 +113,10 @@ TEST_F(TestSchemaConverter, NestedExample) {
   NodePtr schema = GroupNode::Make(name_, Repetition::REPEATED, fields);
 
   ASSERT_TRUE(schema->Equals(group_));
+
+  // Check that the parent relationship in each node is consistent
+  ASSERT_EQ(group_->parent(), nullptr);
+  ASSERT_TRUE(check_for_parent_consistency(group_));
 }
 
 TEST_F(TestSchemaConverter, InvalidRoot) {

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
index eda33a9..6daf577 100644
--- a/src/parquet/schema/schema-descriptor-test.cc
+++ b/src/parquet/schema/schema-descriptor-test.cc
@@ -106,7 +106,7 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
   // optional group bag          1    0
   //   repeated group records    2    1
   //     required int64 item1    2    1
-  //     optional boolean item1  3    1
+  //     optional boolean item2  3    1
   //     repeated int32 item3    3    2
   int16_t ex_max_def_levels[6] = {0, 1, 1, 2, 3, 3};
   int16_t ex_max_rep_levels[6] = {0, 0, 1, 1, 1, 2};
@@ -117,6 +117,13 @@ TEST_F(TestSchemaDescriptor, BuildTree) {
     EXPECT_EQ(ex_max_rep_levels[i], col->max_repetition_level()) << i;
   }
 
+  ASSERT_EQ(descr_.Column(0)->path()->ToDotString(), "a");
+  ASSERT_EQ(descr_.Column(1)->path()->ToDotString(), "b");
+  ASSERT_EQ(descr_.Column(2)->path()->ToDotString(), "c");
+  ASSERT_EQ(descr_.Column(3)->path()->ToDotString(), "bag.records.item1");
+  ASSERT_EQ(descr_.Column(4)->path()->ToDotString(), "bag.records.item2");
+  ASSERT_EQ(descr_.Column(5)->path()->ToDotString(), "bag.records.item3");
+
   // Init clears the leaves
   descr_.Init(schema);
   ASSERT_EQ(nleaves, descr_.num_columns());

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/schema-types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc
index fa4718a..d58c2e7 100644
--- a/src/parquet/schema/schema-types-test.cc
+++ b/src/parquet/schema/schema-types-test.cc
@@ -35,6 +35,21 @@ namespace parquet_cpp {
 namespace schema {
 
 // ----------------------------------------------------------------------
+// ColumnPath
+
+TEST(TestColumnPath, TestAttrs) {
+  // Construction from an explicit list of components.
+  ColumnPath direct(std::vector<std::string>({"toplevel", "leaf"}));
+  ASSERT_EQ(direct.ToDotString(), "toplevel.leaf");
+  // Round-trip through the dotted-string representation.
+  std::shared_ptr<ColumnPath> parsed = ColumnPath::FromDotString("toplevel.leaf");
+  ASSERT_EQ(parsed->ToDotString(), "toplevel.leaf");
+  // extend() yields a path with one more trailing component.
+  std::shared_ptr<ColumnPath> longer = parsed->extend("anotherlevel");
+  ASSERT_EQ(longer->ToDotString(), "toplevel.leaf.anotherlevel");
+}
+
+// ----------------------------------------------------------------------
 // Primitive node
 
 class TestPrimitiveNode : public ::testing::Test {

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
index b57fd37..a6be222 100644
--- a/src/parquet/schema/types.cc
+++ b/src/parquet/schema/types.cc
@@ -17,6 +17,7 @@
 
 #include "parquet/schema/types.h"
 
+#include <algorithm>
 #include <memory>
 
 #include "parquet/exception.h"
@@ -28,6 +29,40 @@ namespace parquet_cpp {
 namespace schema {
 
 // ----------------------------------------------------------------------
+// ColumnPath
+
+// Splits dotstring on '.' and wraps the resulting components in a ColumnPath.
+std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
+  std::vector<std::string> components;
+  std::istringstream input(dotstring);
+  for (std::string token; std::getline(input, token, '.');) {
+    components.push_back(token);
+  }
+  return std::shared_ptr<ColumnPath>(new ColumnPath(std::move(components)));
+}
+
+// Returns a new path with node_name appended; this path is not modified.
+std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
+  std::vector<std::string> path;
+  // Reserve once so copying the prefix and appending never reallocate.
+  path.reserve(path_.size() + 1);
+  path.insert(path.end(), path_.cbegin(), path_.cend());
+  path.push_back(node_name);
+  return std::make_shared<ColumnPath>(std::move(path));
+}
+
+// Joins the path components with '.' separators, e.g. {"a", "b"} -> "a.b".
+std::string ColumnPath::ToDotString() const {
+  std::stringstream joined;
+  const char* separator = "";
+  for (const std::string& component : path_) {
+    joined << separator << component;
+    separator = ".";
+  }
+  return joined.str();
+}
+
+// ----------------------------------------------------------------------
 // Base node
 
 bool Node::EqualsInternal(const Node* other) const {
@@ -37,6 +72,10 @@ bool Node::EqualsInternal(const Node* other) const {
     logical_type_ == other->logical_type_;
 }
 
+// Records `parent` as this node's parent. Only the raw pointer is stored;
+// GroupNode calls this for each of its fields during construction.
+void Node::SetParent(const Node* parent) { parent_ = parent; }
+
 // ----------------------------------------------------------------------
 // Primitive node
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/f785e4c7/src/parquet/schema/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h
index 2eaee8b..0972ac6 100644
--- a/src/parquet/schema/types.h
+++ b/src/parquet/schema/types.h
@@ -78,6 +78,23 @@ struct DecimalMetadata {
   int32_t precision;
 };
 
+// Sequence of node names identifying a column; ToDotString() renders "a.b.c".
+class ColumnPath {
+ public:
+  ColumnPath() : path_() {}
+  explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+  // `path` is an lvalue inside the constructor, so it must be moved explicitly.
+  explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+  // Builds a path by splitting `dotstring` on '.'.
+  static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+  std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+  std::string ToDotString() const;
+ protected:
+  std::vector<std::string> path_;
+};
+
+class GroupNode;
+
 // Base class for logical schema types. A type has a name, repetition level,
 // and optionally a logical type (ConvertedType in Parquet metadata parlance)
 class Node {
@@ -95,7 +112,8 @@ class Node {
       name_(name),
       repetition_(repetition),
       logical_type_(logical_type),
-      id_(id) {}
+      id_(id),
+      parent_(nullptr) {}
 
   virtual ~Node() {}
 
@@ -141,6 +159,10 @@ class Node {
     return id_;
   }
 
+  const Node* parent() const {
+    return parent_;
+  }
+
   // Node::Visitor abstract class for walking schemas with the visitor pattern
   class Visitor {
    public:
@@ -152,13 +174,18 @@ class Node {
   virtual void Visit(Visitor* visitor) = 0;
 
  protected:
+  friend class GroupNode;
+
   Node::type type_;
   std::string name_;
   Repetition::type repetition_;
   LogicalType::type logical_type_;
   int id_;
+  // Nodes should not be shared, they have a single parent.
+  const Node* parent_;
 
   bool EqualsInternal(const Node* other) const;
+  void SetParent(const Node* p_parent);
 };
 
 // Save our breath all over the place with these typedefs
@@ -259,7 +286,11 @@ class GroupNode : public Node {
       LogicalType::type logical_type = LogicalType::NONE,
       int id = -1) :
       Node(Node::GROUP, name, repetition, logical_type, id),
-      fields_(fields) {}
+      fields_(fields) {
+      for (NodePtr& field : fields_) {
+        field->SetParent(this);
+      }
+    }
 
   NodeVector fields_;
   bool EqualsInternal(const GroupNode* other) const;