You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2017/05/10 20:02:56 UTC

orc git commit: ORC-183: Add a method in Type to build type

Repository: orc
Updated Branches:
  refs/heads/master ec95303b0 -> 90f138b06


ORC-183: Add a method in Type to build type

Added static ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input)

Fixes #115

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/90f138b0
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/90f138b0
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/90f138b0

Branch: refs/heads/master
Commit: 90f138b06053b18b86b09edcb33bf8dc25d7f659
Parents: ec95303
Author: Gang Wu <ga...@alibaba-inc.com>
Authored: Thu Apr 27 21:31:35 2017 -0700
Committer: Owen O'Malley <om...@apache.org>
Committed: Wed May 10 13:02:22 2017 -0700

----------------------------------------------------------------------
 c++/include/orc/Type.hh |   5 ++
 c++/src/TypeImpl.cc     | 196 +++++++++++++++++++++++++++++++++++++++++++
 c++/src/TypeImpl.hh     |  67 +++++++++++++++
 c++/test/TestType.cc    |  23 +++++
 4 files changed, 291 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/include/orc/Type.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh
index 25b8f53..68f5ec2 100644
--- a/c++/include/orc/Type.hh
+++ b/c++/include/orc/Type.hh
@@ -82,6 +82,11 @@ namespace orc {
      * @return a reference to the union type
      */
     virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0;
+
+    /**
+     * Build a Type object from string text representation.
+     * @param input the type text, for example "map<boolean,float>"
+     * @return the type described by input
+     * @throws std::logic_error if input does not parse to exactly one type
+     */
+    static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input);
   };
 
   const int64_t DEFAULT_DECIMAL_SCALE = 18;

http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/src/TypeImpl.cc
----------------------------------------------------------------------
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index fdf66a0..6074f94 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -491,4 +491,200 @@ namespace orc {
     return std::unique_ptr<Type>(result);
   }
 
+  /**
+   * Build a Type tree from its text representation, for example
+   * "struct<a:int,b:string>".
+   * @param input the complete type string
+   * @return the parsed type, owned by the returned pointer
+   * @throws std::logic_error if input does not describe exactly one type
+   */
+  ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) {
+    std::vector<std::pair<std::string, Type*> > res =
+      TypeImpl::parseType(input, 0, input.size());
+    if (res.size() != 1) {
+      // parseType hands back raw owning pointers; release them before
+      // reporting the error so a rejected string does not leak.
+      for (size_t i = 0; i < res.size(); ++i) {
+        delete res[i].second;
+      }
+      throw std::logic_error("Invalid type string.");
+    }
+    return ORC_UNIQUE_PTR<Type>(res[0].second);
+  }
+
+  /**
+   * Build a LIST type from the text between the angle brackets of an
+   * array type.
+   * @param input the string that contains the element type
+   * @param start index of the first character of the element type
+   * @param end index one past the last character of the element type
+   * @return a newly allocated LIST type; the caller takes ownership
+   * @throws std::logic_error unless exactly one sub type is found
+   */
+  Type* TypeImpl::parseArrayType(const std::string &input,
+                                 size_t start,
+                                 size_t end) {
+    // Held by a smart pointer until returned so that it is released if
+    // parsing or validation throws.
+    ORC_UNIQUE_PTR<TypeImpl> arrayType(new TypeImpl(LIST));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() != 1) {
+      // The parsed children are raw owning pointers; free them too.
+      for (size_t i = 0; i < v.size(); ++i) {
+        delete v[i].second;
+      }
+      throw std::logic_error("Array type must contain exactly one sub type.");
+    }
+    arrayType->addChildType(ORC_UNIQUE_PTR<Type>(v[0].second));
+    return arrayType.release();
+  }
+
+  /**
+   * Build a MAP type from the text between the angle brackets of a
+   * map type. The first sub type becomes the first child and the second
+   * sub type the second child.
+   * @param input the string that contains the two sub types
+   * @param start index of the first character of the sub type list
+   * @param end index one past the last character of the sub type list
+   * @return a newly allocated MAP type; the caller takes ownership
+   * @throws std::logic_error unless exactly two sub types are found
+   */
+  Type* TypeImpl::parseMapType(const std::string &input,
+                               size_t start,
+                               size_t end) {
+    // Held by a smart pointer until returned so that it is released if
+    // parsing or validation throws.
+    ORC_UNIQUE_PTR<TypeImpl> mapType(new TypeImpl(MAP));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() != 2) {
+      // The parsed children are raw owning pointers; free them too.
+      for (size_t i = 0; i < v.size(); ++i) {
+        delete v[i].second;
+      }
+      throw std::logic_error(
+        "Map type must contain exactly two sub types.");
+    }
+    mapType->addChildType(ORC_UNIQUE_PTR<Type>(v[0].second));
+    mapType->addChildType(ORC_UNIQUE_PTR<Type>(v[1].second));
+    return mapType.release();
+  }
+
+  /**
+   * Build a STRUCT type from the text between the angle brackets of a
+   * struct type. Each parsed (name, type) pair becomes one field, in
+   * order.
+   * @param input the string that contains the field list
+   * @param start index of the first character of the field list
+   * @param end index one past the last character of the field list
+   * @return a newly allocated STRUCT type; the caller takes ownership
+   * @throws std::logic_error if no field is found
+   */
+  Type* TypeImpl::parseStructType(const std::string &input,
+                                  size_t start,
+                                  size_t end) {
+    // Held by a smart pointer until returned so that it is released if
+    // parsing or validation throws.
+    ORC_UNIQUE_PTR<TypeImpl> structType(new TypeImpl(STRUCT));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() == 0) {
+      throw std::logic_error(
+        "Struct type must contain at least one sub type.");
+    }
+    for (size_t i = 0; i < v.size(); ++i) {
+      structType->addStructField(v[i].first, ORC_UNIQUE_PTR<Type>(v[i].second));
+    }
+    return structType.release();
+  }
+
+  /**
+   * Build a UNION type from the text between the angle brackets of a
+   * union type. Each parsed sub type becomes one child, in order.
+   * @param input the string that contains the sub type list
+   * @param start index of the first character of the sub type list
+   * @param end index one past the last character of the sub type list
+   * @return a newly allocated UNION type; the caller takes ownership
+   * @throws std::logic_error if no sub type is found
+   */
+  Type* TypeImpl::parseUnionType(const std::string &input,
+                                 size_t start,
+                                 size_t end) {
+    // Held by a smart pointer until returned so that it is released if
+    // parsing or validation throws.
+    ORC_UNIQUE_PTR<TypeImpl> unionType(new TypeImpl(UNION));
+    std::vector<std::pair<std::string, Type*> > v =
+      TypeImpl::parseType(input, start, end);
+    if (v.size() == 0) {
+      throw std::logic_error("Union type must contain at least one sub type.");
+    }
+    for (size_t i = 0; i < v.size(); ++i) {
+      unionType->addChildType(ORC_UNIQUE_PTR<Type>(v[i].second));
+    }
+    return unionType.release();
+  }
+
+  /**
+   * Build a DECIMAL type from the "precision,scale" text found in
+   * input[start, end).
+   * @param input the string that contains the precision and scale
+   * @param start index of the first character of the precision
+   * @param end index one past the last character of the scale
+   * @return a newly allocated DECIMAL type; the caller takes ownership
+   * @throws std::logic_error if no comma is found at or after start, or
+   *         if nothing lies between the comma and end
+   */
+  Type* TypeImpl::parseDecimalType(const std::string &input,
+                                   size_t start,
+                                   size_t end) {
+    const size_t comma = input.find(',', start);
+    if (comma == std::string::npos || comma + 1 >= end) {
+      throw std::logic_error("Decimal type must specify precision and scale.");
+    }
+    // Text on either side of the comma; both are converted with atoi.
+    const std::string precisionText = input.substr(start, comma - start);
+    const std::string scaleText = input.substr(comma + 1, end - comma - 1);
+    const uint64_t precision =
+      static_cast<uint64_t>(atoi(precisionText.c_str()));
+    const uint64_t scale = static_cast<uint64_t>(atoi(scaleText.c_str()));
+    return new TypeImpl(DECIMAL, precision, scale);
+  }
+
+  /**
+   * Create the type named by category. Primitive names need nothing
+   * else; compound and parameterized names read their arguments from
+   * input[start, end).
+   * @param category the type name, such as "int", "map" or "varchar"
+   * @param input the string that contains the arguments of the type
+   * @param start index of the first character of the arguments
+   * @param end index one past the last character of the arguments
+   * @return a newly allocated type; the caller takes ownership
+   * @throws std::logic_error if category is not a known type name
+   */
+  Type* TypeImpl::parseCategory(std::string category,
+                                const std::string &input,
+                                size_t start,
+                                size_t end) {
+    // Primitive types: input is not consulted.
+    if (category == "boolean") {
+      return new TypeImpl(BOOLEAN);
+    }
+    if (category == "tinyint") {
+      return new TypeImpl(BYTE);
+    }
+    if (category == "smallint") {
+      return new TypeImpl(SHORT);
+    }
+    if (category == "int") {
+      return new TypeImpl(INT);
+    }
+    if (category == "bigint") {
+      return new TypeImpl(LONG);
+    }
+    if (category == "float") {
+      return new TypeImpl(FLOAT);
+    }
+    if (category == "double") {
+      return new TypeImpl(DOUBLE);
+    }
+    if (category == "string") {
+      return new TypeImpl(STRING);
+    }
+    if (category == "binary") {
+      return new TypeImpl(BINARY);
+    }
+    if (category == "timestamp") {
+      return new TypeImpl(TIMESTAMP);
+    }
+    if (category == "date") {
+      return new TypeImpl(DATE);
+    }
+
+    // Compound types: the children are parsed from input[start, end).
+    if (category == "array") {
+      return parseArrayType(input, start, end);
+    }
+    if (category == "map") {
+      return parseMapType(input, start, end);
+    }
+    if (category == "struct") {
+      return parseStructType(input, start, end);
+    }
+    if (category == "uniontype") {
+      return parseUnionType(input, start, end);
+    }
+
+    // Parameterized types: the numbers are read from input[start, end).
+    if (category == "decimal") {
+      return parseDecimalType(input, start, end);
+    }
+    if (category == "varchar" || category == "char") {
+      const std::string lengthText = input.substr(start, end - start);
+      uint64_t maxLength = static_cast<uint64_t>(atoi(lengthText.c_str()));
+      return new TypeImpl(category == "char" ? CHAR : VARCHAR, maxLength);
+    }
+
+    throw std::logic_error("Unknown type " + category);
+  }
+
+  /**
+   * Parse the comma-separated type list held in input[start, end).
+   * Each element is an optional "name:" prefix followed by a type name
+   * and, for compound or parameterized types, arguments enclosed in
+   * <...> or (...).
+   * @param input the string that contains the type list
+   * @param start index of the first character of the list
+   * @param end index one past the last character of the list
+   * @return one (field name, type) pair per element, in order; the field
+   *         name is empty when the element has no "name:" prefix. The
+   *         types are raw pointers that the caller must take ownership of
+   * @throws std::logic_error on an unknown type name, an unexpected
+   *         character, or a missing closing bracket
+   */
+  std::vector<std::pair<std::string, Type *> > TypeImpl::parseType(
+                                                       const std::string &input,
+                                                       size_t start,
+                                                       size_t end) {
+    std::string types = input.substr(start, end - start);
+    std::vector<std::pair<std::string, Type *> > res;
+    size_t pos = 0;
+
+    try {
+      while (pos < types.size()) {
+        // Scan the leading word. The cast keeps the <cctype> calls well
+        // defined for characters with a negative char value.
+        size_t endPos = pos;
+        while (endPos < types.size() &&
+               isalnum(static_cast<unsigned char>(types[endPos]))) {
+          ++endPos;
+        }
+
+        // A ':' means the word was a field name and the type name follows.
+        std::string fieldName;
+        if (endPos < types.size() && types[endPos] == ':') {
+          fieldName = types.substr(pos, endPos - pos);
+          pos = ++endPos;
+          while (endPos < types.size() &&
+                 isalpha(static_cast<unsigned char>(types[endPos]))) {
+            ++endPos;
+          }
+        }
+
+        // Character that ends the type name; '\0' stands for end of text.
+        const char delim = endPos < types.size() ? types[endPos] : '\0';
+        const bool bracketed = (delim == '<' || delim == '(');
+
+        size_t nextPos = endPos + 1;
+        if (delim == '<') {
+          // Find the '>' that matches this '<', allowing nested pairs.
+          int count = 1;
+          while (nextPos < types.size()) {
+            if (types[nextPos] == '<') {
+              ++count;
+            } else if (types[nextPos] == '>') {
+              --count;
+            }
+            if (count == 0) {
+              break;
+            }
+            ++nextPos;
+          }
+          if (nextPos == types.size()) {
+            throw std::logic_error(
+              "Invalid type string. Cannot find closing >");
+          }
+        } else if (delim == '(') {
+          while (nextPos < types.size() && types[nextPos] != ')') {
+            ++nextPos;
+          }
+          if (nextPos == types.size()) {
+            throw std::logic_error(
+              "Invalid type string. Cannot find closing )");
+          }
+        } else if (delim != ',' && delim != '\0') {
+          throw std::logic_error("Unrecognized character.");
+        }
+
+        // A closing bracket must be followed by a separator or by the end
+        // of the text; otherwise the character would be skipped unseen.
+        if (bracketed && nextPos + 1 < types.size() &&
+            types[nextPos + 1] != ',') {
+          throw std::logic_error("Unrecognized character.");
+        }
+
+        std::string category = types.substr(pos, endPos - pos);
+        Type* type = parseCategory(category, types, endPos + 1, nextPos);
+        res.push_back(std::make_pair(fieldName, type));
+
+        if (bracketed) {
+          // Step over the closing bracket and the separator after it.
+          pos = nextPos + 2;
+        } else {
+          // nextPos is already one past the separator (or past the end).
+          pos = nextPos;
+        }
+      }
+    } catch (...) {
+      // The vector owns its raw pointers until it is returned; release the
+      // types parsed so far before passing the error on.
+      for (size_t i = 0; i < res.size(); ++i) {
+        delete res[i].second;
+      }
+      throw;
+    }
+
+    return res;
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/src/TypeImpl.hh
----------------------------------------------------------------------
diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh
index e2866e4..3c3f739 100644
--- a/c++/src/TypeImpl.hh
+++ b/c++/src/TypeImpl.hh
@@ -98,6 +98,11 @@ namespace orc {
      */
     void addChildType(std::unique_ptr<Type> childType);
 
+    static std::vector<std::pair<std::string, Type *> > parseType(  // parses input[start, end) into (field name, type) pairs
+      const std::string &input,  // string that contains the comma-separated type list
+      size_t start,              // index of the first character of the list
+      size_t end);               // index one past the last character of the list
+
   private:
     /**
      * Assign ids to this node and its children giving this
@@ -110,6 +115,68 @@ namespace orc {
      * Ensure that ids are assigned to all of the nodes.
      */
     void ensureIdAssigned() const;
+
+    /**
+     * Parse array type from string
+     * @param input the input string of an array type
+     * @param start index in input of the first character of the
+     *        element type (just after the opening '<')
+     * @param end index in input one past the last character of the
+     *        element type (the position of the closing '>')
+     * @return a newly allocated LIST type holding the element type
+     * @throws std::logic_error unless exactly one sub type is found
+     */
+    static Type* parseArrayType(const std::string &input,
+                                size_t start,
+                                size_t end);
+
+    /**
+     * Parse map type from string
+     * @param input the input string of a map type
+     * @param start index in input of the first character of the
+     *        sub type list (just after the opening '<')
+     * @param end index in input one past the last character of the
+     *        sub type list (the position of the closing '>')
+     * @return a newly allocated MAP type holding the two sub types
+     * @throws std::logic_error unless exactly two sub types are found
+     */
+    static Type* parseMapType(const std::string &input,
+                              size_t start,
+                              size_t end);
+
+    /**
+     * Parse struct type from string
+     * @param input the input string of a struct type
+     * @param start index in input of the first character of the
+     *        field list (just after the opening '<')
+     * @param end index in input one past the last character of the
+     *        field list (the position of the closing '>')
+     * @return a newly allocated STRUCT type with one field per parsed
+     *         element
+     * @throws std::logic_error if no sub type is found
+     */
+    static Type* parseStructType(const std::string &input,
+                                 size_t start,
+                                 size_t end);
+
+    /**
+     * Parse union type from string
+     * @param input the input string of a union type
+     * @param start index in input of the first character of the
+     *        sub type list (just after the opening '<')
+     * @param end index in input one past the last character of the
+     *        sub type list (the position of the closing '>')
+     * @return a newly allocated UNION type with one child per parsed
+     *         element
+     * @throws std::logic_error if no sub type is found
+     */
+    static Type* parseUnionType(const std::string &input,
+                                size_t start,
+                                size_t end);
+
+    /**
+     * Parse decimal type from string
+     * @param input the input string of a decimal type
+     * @param start index in input of the first character of the
+     *        precision (just after the opening '(')
+     * @param end index in input one past the last character of the
+     *        scale (the position of the closing ')')
+     * @return a newly allocated DECIMAL type; precision and scale are
+     *         converted with atoi
+     * @throws std::logic_error if no comma is found at or after start,
+     *         or if nothing lies between the comma and end
+     */
+    static Type* parseDecimalType(const std::string &input,
+                                  size_t start,
+                                  size_t end);
+
+    /**
+     * Parse type for a category
+     * @param category type name, one of: boolean, tinyint, smallint,
+     *        int, bigint, float, double, string, binary, timestamp,
+     *        date, array, map, struct, uniontype, decimal, varchar, char
+     * @param input the input string of the category
+     * @param start index in input of the first character of the
+     *        arguments of the type; not read for primitive types
+     * @param end index in input one past the last character of the
+     *        arguments of the type; not read for primitive types
+     * @return a newly allocated type for the category
+     * @throws std::logic_error if category is not a known type name
+     */
+    static Type* parseCategory(std::string category,
+                               const std::string &input,
+                               size_t start,
+                               size_t end);
   };
 
   std::unique_ptr<Type> convertType(const proto::Type& type,

http://git-wip-us.apache.org/repos/asf/orc/blob/90f138b0/c++/test/TestType.cc
----------------------------------------------------------------------
diff --git a/c++/test/TestType.cc b/c++/test/TestType.cc
index 3c595d0..8ce9313 100644
--- a/c++/test/TestType.cc
+++ b/c++/test/TestType.cc
@@ -274,4 +274,27 @@ namespace orc {
     EXPECT_EQ(13, cutType->getSubtype(1)->getColumnId());
     EXPECT_EQ(13, cutType->getSubtype(1)->getMaximumColumnId());
   }
+
+  TEST(TestType, buildTypeFromString) {
+    // Every string must come back unchanged after being parsed into a
+    // Type and printed again with toString().
+    const char* const typeStrings[] = {
+      "struct<a:int,b:string,c:decimal(10,2),d:varchar(5)>",
+      "map<boolean,float>",
+      "uniontype<bigint,binary,timestamp>",
+      "struct<a:bigint,b:struct<a:binary,b:timestamp>>",
+      "struct<a:bigint,b:struct<a:binary,b:timestamp>,c:map<double,tinyint>>"
+    };
+    const size_t caseCount = sizeof(typeStrings) / sizeof(typeStrings[0]);
+
+    for (size_t i = 0; i < caseCount; ++i) {
+      const std::string typeStr(typeStrings[i]);
+      ORC_UNIQUE_PTR<Type> type = Type::buildTypeFromString(typeStr);
+      EXPECT_EQ(typeStr, type->toString());
+    }
+  }
 }