You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/08/18 21:22:06 UTC
orc git commit: ORC-92. Add support for nested column id selection (Chunyang Wen via omalley)

Repository: orc
Updated Branches:
  refs/heads/master a907324be -> d43aa04ec


ORC-92. Add support for nested column id selection (Chunyang Wen via omalley)

Fixes #54

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/d43aa04e
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/d43aa04e
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/d43aa04e

Branch: refs/heads/master
Commit: d43aa04ecf763e0ded2a36bd424bcd9f9ef251c2
Parents: a907324
Author: wenchunyang <we...@baidu.com>
Authored: Wed Aug 10 10:04:21 2016 +0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Thu Aug 18 14:12:57 2016 -0700

----------------------------------------------------------------------
 c++/include/orc/Reader.hh |  22 +++++-
 c++/src/Reader.cc         | 170 +++++++++++++++++++++++++++--------------
 tools/test/TestMatch.cc   |  29 +++++++
 3 files changed, 160 insertions(+), 61 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/d43aa04e/c++/include/orc/Reader.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 14d4d3c..25a0a17 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -517,6 +517,18 @@ namespace orc {
     ReaderOptions& include(const std::list<std::string>& include);
 
     /**
+     * Selects which type ids to read. The root type is always 0 and the
+     * rest of the types are labeled in a preorder traversal of the tree.
+     * The parent types are automatically selected, but the children are not.
+     *
+     * This option clears any previous setting of the selected columns or
+     * types.
+     * @param types a list of the type ids to read
+     * @return this
+     */
+    ReaderOptions& includeTypes(const std::list<uint64_t>& types);
+
+    /**
      * Set the section of the file to process.
      * @param offset the starting byte offset
      * @param length the number of bytes to read
@@ -581,13 +593,17 @@ namespace orc {
     ReaderOptions& setMemoryPool(MemoryPool& pool);
 
     /**
-     * Were the include indexes set?
+     * Were the field ids set?
      */
     bool getIndexesSet() const;
 
     /**
-     * Get the list of selected columns to read. All children of the selected
-     * columns are also selected.
+     * Were the type ids set?
+     */
+    bool getTypeIdsSet() const;
+
+    /**
+     * Get the list of selected field or type ids to read.
      */
     const std::list<uint64_t>& getInclude() const;
 

http://git-wip-us.apache.org/repos/asf/orc/blob/d43aa04e/c++/src/Reader.cc
----------------------------------------------------------------------
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index b3eeb4a..184adf4 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -35,6 +35,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <set>
 
 namespace orc {
 
@@ -76,9 +77,15 @@ namespace orc {
     return buffer.str();
   }
 
+  enum ColumnSelection {
+    ColumnSelection_NONE = 0,
+    ColumnSelection_FIELD_NAMES = 1,
+    ColumnSelection_FIELD_IDS = 2,
+    ColumnSelection_TYPE_IDS = 3
+  };
+
   struct ReaderOptionsPrivate {
-    bool setIndexes;
-    bool setNames;
+    ColumnSelection selection;
     std::list<uint64_t> includedColumnIndexes;
     std::list<std::string> includedColumnNames;
     uint64_t dataStart;
@@ -91,8 +98,7 @@ namespace orc {
     std::string serializedTail;
 
     ReaderOptionsPrivate() {
-      setIndexes = false;
-      setNames = false;
+      selection = ColumnSelection_NONE;
       dataStart = 0;
       dataLength = std::numeric_limits<uint64_t>::max();
       tailLocation = std::numeric_limits<uint64_t>::max();
@@ -134,22 +140,26 @@ namespace orc {
   }
 
   ReaderOptions& ReaderOptions::include(const std::list<uint64_t>& include) {
-    privateBits->setIndexes = true;
+    privateBits->selection = ColumnSelection_FIELD_IDS;
     privateBits->includedColumnIndexes.assign(include.begin(), include.end());
-    privateBits->setNames = false;
     privateBits->includedColumnNames.clear();
     return *this;
   }
 
-  ReaderOptions& ReaderOptions::include
-       (const std::list<std::string>& include) {
-    privateBits->setNames = true;
+  ReaderOptions& ReaderOptions::include(const std::list<std::string>& include) {
+    privateBits->selection = ColumnSelection_FIELD_NAMES;
     privateBits->includedColumnNames.assign(include.begin(), include.end());
-    privateBits->setIndexes = false;
     privateBits->includedColumnIndexes.clear();
     return *this;
   }
 
+  ReaderOptions& ReaderOptions::includeTypes(const std::list<uint64_t>& types) {
+    privateBits->selection = ColumnSelection_TYPE_IDS;
+    privateBits->includedColumnIndexes.assign(types.begin(), types.end());
+    privateBits->includedColumnNames.clear();
+    return *this;
+  }
+
   ReaderOptions& ReaderOptions::range(uint64_t offset,
                                       uint64_t length) {
     privateBits->dataStart = offset;
@@ -178,7 +188,11 @@ namespace orc {
   }
 
   bool ReaderOptions::getIndexesSet() const {
-    return privateBits->setIndexes;
+    return privateBits->selection == ColumnSelection_FIELD_IDS;
+  }
+
+  bool ReaderOptions::getTypeIdsSet() const {
+    return privateBits->selection == ColumnSelection_TYPE_IDS;
   }
 
   const std::list<uint64_t>& ReaderOptions::getInclude() const {
@@ -186,7 +200,7 @@ namespace orc {
   }
 
   bool ReaderOptions::getNamesSet() const {
-    return privateBits->setNames;
+    return privateBits->selection == ColumnSelection_FIELD_NAMES;
   }
 
   const std::list<std::string>& ReaderOptions::getIncludeNames() const {
@@ -1113,10 +1127,24 @@ namespace orc {
     proto::StripeFooter getStripeFooter(const proto::StripeInformation& info);
     void startNextStripe();
     void checkOrcVersion();
-    void selectType(const Type& type);
     void readMetadata() const;
-    void updateSelected(const std::list<uint64_t>& fieldIds);
-    void updateSelected(const std::list<std::string>& fieldNames);
+
+    // Select the columns from the options object
+    void updateSelected();
+
+    // Select a field by name
+    void updateSelectedByName(const std::string& name);
+    // Select a field by id
+    void updateSelectedByFieldId(uint64_t fieldId);
+    // Select a type by id
+    void updateSelectedByTypeId(uint64_t typeId);
+
+    // Select all of the recursive children of the given type.
+    void selectChildren(const Type& type);
+
+    // Select type, and recursively each unselected type below it, if one
+    // of its descendants is selected. Returns true if type ends up selected.
+    bool selectParents(const Type& type);
 
   public:
     /**
@@ -1277,25 +1305,31 @@ namespace orc {
     }
 
     schema = convertType(footer->types(0), *footer);
+    updateSelected();
+  }
 
+  void ReaderImpl::updateSelected() {
     selectedColumns.assign(static_cast<size_t>(footer->types_size()), false);
     if (schema->getKind() == STRUCT && options.getIndexesSet()) {
-      updateSelected(options.getInclude());
+      for(std::list<uint64_t>::const_iterator field = options.getInclude().begin();
+          field != options.getInclude().end(); ++field) {
+        updateSelectedByFieldId(*field);
+      }
     } else if (schema->getKind() == STRUCT && options.getNamesSet()) {
-      updateSelected(options.getIncludeNames());
+      for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
+          field != options.getIncludeNames().end(); ++field) {
+        updateSelectedByName(*field);
+      }
+    } else if (options.getTypeIdsSet()) {
+      for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin();
+          typeId != options.getInclude().end(); ++typeId) {
+        updateSelectedByTypeId(*typeId);
+      }
     } else {
+      // default is to select all columns
       std::fill(selectedColumns.begin(), selectedColumns.end(), true);
     }
-    selectedColumns[0] = true;
-  }
-
-  void ReaderImpl::selectType(const Type& type) {
-    if (!selectedColumns[static_cast<size_t>(type.getColumnId())]) {
-      selectedColumns[static_cast<size_t>(type.getColumnId())] = true;
-      for (uint64_t i=0; i < type.getSubtypeCount(); i++) {
-        selectType(*type.getSubtype(i));
-      }
-    }
+    selectParents(*schema);
   }
 
   std::string ReaderImpl::getSerializedFileTail() const {
@@ -2209,43 +2243,63 @@ namespace orc {
     }
   }
 
-  void ReaderImpl::updateSelected(const std::list<uint64_t>& fieldIds) {
-    uint64_t childCount = schema->getSubtypeCount();
-    for(std::list<uint64_t>::const_iterator i = fieldIds.begin();
-        i != fieldIds.end(); ++i) {
-      if (*i >= childCount) {
-        std::stringstream buffer;
-        buffer << "Invalid column selected " << *i << " out of "
-               << childCount;
-        throw ParseError(buffer.str());
+  void ReaderImpl::updateSelectedByFieldId(uint64_t fieldId) {
+    if (fieldId < schema->getSubtypeCount()) {
+      selectChildren(*schema->getSubtype(fieldId));
+    } else {
+      std::stringstream buffer;
+      buffer << "Invalid column selected " << fieldId << " out of "
+             << schema->getSubtypeCount();
+      throw ParseError(buffer.str());
+    }
+  }
+
+  void ReaderImpl::updateSelectedByTypeId(uint64_t typeId) {
+    if (typeId < selectedColumns.size()) {
+      selectedColumns[typeId] = true;
+    } else {
+      std::stringstream buffer;
+      buffer << "Invalid type id selected " << typeId << " out of "
+             << selectedColumns.size();
+      throw ParseError(buffer.str());
+    }
+  }
+
+  void ReaderImpl::updateSelectedByName(const std::string& fieldName) {
+    for(size_t field=0; field < schema->getSubtypeCount(); ++field) {
+      if (schema->getFieldName(field) == fieldName) {
+        selectChildren(*schema->getSubtype(field));
+        return;
       }
-      const Type& child = *schema->getSubtype(*i);
-      for(size_t c = child.getColumnId();
-          c <= child.getMaximumColumnId(); ++c){
+    }
+    throw ParseError("Invalid column selected " + fieldName);
+  }
+
+  void ReaderImpl::selectChildren(const Type& type) {
+    size_t id = static_cast<size_t>(type.getColumnId());
+    if (!selectedColumns[id]) {
+      selectedColumns[id] = true;
+      for(size_t c = id; c <= type.getMaximumColumnId(); ++c){
         selectedColumns[c] = true;
       }
     }
   }
 
-  void ReaderImpl::updateSelected(const std::list<std::string>& fieldNames) {
-    uint64_t childCount = schema->getSubtypeCount();
-    for(std::list<std::string>::const_iterator i = fieldNames.begin();
-        i != fieldNames.end(); ++i) {
-      bool foundMatch = false;
-      for(size_t field=0; field < childCount; ++field) {
-        if (schema->getFieldName(field) == *i) {
-          const Type& child = *schema->getSubtype(field);
-          for(size_t c = child.getColumnId();
-              c <= child.getMaximumColumnId(); ++c){
-            selectedColumns[c] = true;
-          }
-          foundMatch = true;
-          break;
-        }
-      }
-      if (!foundMatch) {
-        throw ParseError("Invalid column selected " + *i);
-      }
+  /**
+   * Recurses over a type tree and selects the parents of every selected type.
+   * @return true if type itself or any of its descendants is selected.
+   */
+  bool ReaderImpl::selectParents(const Type& type) {
+    size_t id = static_cast<size_t>(type.getColumnId());
+    if (selectedColumns[id]) {
+      return true;
+    }
+    bool result = false;
+    for(uint64_t c=0; c < type.getSubtypeCount(); ++c) {
+      result |= selectParents(*type.getSubtype(c));
     }
+    selectedColumns[id] = result;
+    return result;
   }
+
 }// namespace

http://git-wip-us.apache.org/repos/asf/orc/blob/d43aa04e/tools/test/TestMatch.cc
----------------------------------------------------------------------
diff --git a/tools/test/TestMatch.cc b/tools/test/TestMatch.cc
index c7759ae..9f0c297 100644
--- a/tools/test/TestMatch.cc
+++ b/tools/test/TestMatch.cc
@@ -1035,6 +1035,35 @@ TEST(TestMatch, selectColumns) {
         << "\"887336a7\", \"value\": {\"int1\": -941468492, \"string1\": "
         << "\"887336a7\"}}]}";
     EXPECT_EQ(expectedMap.str(), line);
+
+    // Map column #12 is type id 19; select its subtypes by type id:
+    // map(19)<string(20),struct(21)<int1(22):int,string1(23):string>>
+    // The parent types (19 and 21) are expected to be selected automatically.
+    cols.clear();
+    cols.push_back(20);
+    cols.push_back(22);
+    cols.push_back(23);
+    opts.includeTypes(cols);
+    reader = orc::createReader(orc::readLocalFile(filename), opts);
+    c = reader->getSelectedColumns();
+    for (unsigned int i=1; i < c.size(); i++) {
+      if (i>=19 && i<=23)
+        EXPECT_TRUE(c[i]);
+      else
+        EXPECT_TRUE(!c[i]);
+    }
+    batch = reader->createRowBatch(1);
+    line.clear();
+    printer = createColumnPrinter(line, &reader->getSelectedType());
+    reader->next(*batch);
+    printer->reset(*batch);
+    printer->printRow(0);
+    std::ostringstream expectedMapWithColumnId;
+    expectedMapWithColumnId << "{\"map\": [{\"key\": \"ba419d35-x\", \"value\": {\"int1\":"
+        << " -1598014431, \"string1\": \"ba419d35-x\"}}, {\"key\": "
+        << "\"887336a7\", \"value\": {\"int1\": -941468492, \"string1\": "
+        << "\"887336a7\"}}]}";
+    EXPECT_EQ(expectedMapWithColumnId.str(), line);
 }
 
 TEST(TestMatch, memoryUse) {