You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/08/18 21:22:06 UTC
orc git commit: ORC-92. Add support for nested column id selection
(Chunyang Wen via omalley)
Repository: orc
Updated Branches:
refs/heads/master a907324be -> d43aa04ec
ORC-92. Add support for nested column id selection (Chunyang Wen via omalley)
Fixes #54
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/d43aa04e
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/d43aa04e
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/d43aa04e
Branch: refs/heads/master
Commit: d43aa04ecf763e0ded2a36bd424bcd9f9ef251c2
Parents: a907324
Author: wenchunyang <we...@baidu.com>
Authored: Wed Aug 10 10:04:21 2016 +0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Thu Aug 18 14:12:57 2016 -0700
----------------------------------------------------------------------
c++/include/orc/Reader.hh | 22 +++++-
c++/src/Reader.cc | 170 +++++++++++++++++++++++++++--------------
tools/test/TestMatch.cc | 29 +++++++
3 files changed, 160 insertions(+), 61 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/d43aa04e/c++/include/orc/Reader.hh
----------------------------------------------------------------------
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 14d4d3c..25a0a17 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -517,6 +517,18 @@ namespace orc {
ReaderOptions& include(const std::list<std::string>& include);
/**
+ * Selects which type ids to read. The root type is always 0 and the
+ * rest of the types are labeled in a preorder traversal of the tree.
+ * The parent types are automatically selected, but the children are not.
+ *
+ * This option clears any previous setting of the selected columns or
+ * types.
+ * @param types a list of the type ids to read
+ * @return this
+ */
+ ReaderOptions& includeTypes(const std::list<uint64_t>& types);
+
+ /**
* Set the section of the file to process.
* @param offset the starting byte offset
* @param length the number of bytes to read
@@ -581,13 +593,17 @@ namespace orc {
ReaderOptions& setMemoryPool(MemoryPool& pool);
/**
- * Were the include indexes set?
+ * Were the field ids set?
*/
bool getIndexesSet() const;
/**
- * Get the list of selected columns to read. All children of the selected
- * columns are also selected.
+ * Were the type ids set?
+ */
+ bool getTypeIdsSet() const;
+
+ /**
+ * Get the list of selected field or type ids to read.
*/
const std::list<uint64_t>& getInclude() const;
http://git-wip-us.apache.org/repos/asf/orc/blob/d43aa04e/c++/src/Reader.cc
----------------------------------------------------------------------
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index b3eeb4a..184adf4 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -35,6 +35,7 @@
#include <sstream>
#include <string>
#include <vector>
+#include <set>
namespace orc {
@@ -76,9 +77,15 @@ namespace orc {
return buffer.str();
}
+ enum ColumnSelection {
+ ColumnSelection_NONE = 0,
+ ColumnSelection_FIELD_NAMES = 1,
+ ColumnSelection_FIELD_IDS = 2,
+ ColumnSelection_TYPE_IDS = 3
+ };
+
struct ReaderOptionsPrivate {
- bool setIndexes;
- bool setNames;
+ ColumnSelection selection;
std::list<uint64_t> includedColumnIndexes;
std::list<std::string> includedColumnNames;
uint64_t dataStart;
@@ -91,8 +98,7 @@ namespace orc {
std::string serializedTail;
ReaderOptionsPrivate() {
- setIndexes = false;
- setNames = false;
+ selection = ColumnSelection_NONE;
dataStart = 0;
dataLength = std::numeric_limits<uint64_t>::max();
tailLocation = std::numeric_limits<uint64_t>::max();
@@ -134,22 +140,26 @@ namespace orc {
}
ReaderOptions& ReaderOptions::include(const std::list<uint64_t>& include) {
- privateBits->setIndexes = true;
+ privateBits->selection = ColumnSelection_FIELD_IDS;
privateBits->includedColumnIndexes.assign(include.begin(), include.end());
- privateBits->setNames = false;
privateBits->includedColumnNames.clear();
return *this;
}
- ReaderOptions& ReaderOptions::include
- (const std::list<std::string>& include) {
- privateBits->setNames = true;
+ ReaderOptions& ReaderOptions::include(const std::list<std::string>& include) {
+ privateBits->selection = ColumnSelection_FIELD_NAMES;
privateBits->includedColumnNames.assign(include.begin(), include.end());
- privateBits->setIndexes = false;
privateBits->includedColumnIndexes.clear();
return *this;
}
+ ReaderOptions& ReaderOptions::includeTypes(const std::list<uint64_t>& types) {
+ privateBits->selection = ColumnSelection_TYPE_IDS;
+ privateBits->includedColumnIndexes.assign(types.begin(), types.end());
+ privateBits->includedColumnNames.clear();
+ return *this;
+ }
+
ReaderOptions& ReaderOptions::range(uint64_t offset,
uint64_t length) {
privateBits->dataStart = offset;
@@ -178,7 +188,11 @@ namespace orc {
}
bool ReaderOptions::getIndexesSet() const {
- return privateBits->setIndexes;
+ return privateBits->selection == ColumnSelection_FIELD_IDS;
+ }
+
+ bool ReaderOptions::getTypeIdsSet() const {
+ return privateBits->selection == ColumnSelection_TYPE_IDS;
}
const std::list<uint64_t>& ReaderOptions::getInclude() const {
@@ -186,7 +200,7 @@ namespace orc {
}
bool ReaderOptions::getNamesSet() const {
- return privateBits->setNames;
+ return privateBits->selection == ColumnSelection_FIELD_NAMES;
}
const std::list<std::string>& ReaderOptions::getIncludeNames() const {
@@ -1113,10 +1127,24 @@ namespace orc {
proto::StripeFooter getStripeFooter(const proto::StripeInformation& info);
void startNextStripe();
void checkOrcVersion();
- void selectType(const Type& type);
void readMetadata() const;
- void updateSelected(const std::list<uint64_t>& fieldIds);
- void updateSelected(const std::list<std::string>& fieldNames);
+
+ // Select the columns from the options object
+ void updateSelected();
+
+ // Select a field by name
+ void updateSelectedByName(const std::string& name);
+ // Select a field by id
+ void updateSelectedByFieldId(uint64_t fieldId);
+ // Select a type by id
+ void updateSelectedByTypeId(uint64_t typeId);
+
+ // Select all of the recursive children of the given type.
+ void selectChildren(const Type& type);
+
+ // Select every type that has a selected descendant, so that the
+ // parents of all selected types are selected as well.
+ bool selectParents(const Type& type);
public:
/**
@@ -1277,25 +1305,31 @@ namespace orc {
}
schema = convertType(footer->types(0), *footer);
+ updateSelected();
+ }
+ void ReaderImpl::updateSelected() {
selectedColumns.assign(static_cast<size_t>(footer->types_size()), false);
if (schema->getKind() == STRUCT && options.getIndexesSet()) {
- updateSelected(options.getInclude());
+ for(std::list<uint64_t>::const_iterator field = options.getInclude().begin();
+ field != options.getInclude().end(); ++field) {
+ updateSelectedByFieldId(*field);
+ }
} else if (schema->getKind() == STRUCT && options.getNamesSet()) {
- updateSelected(options.getIncludeNames());
+ for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin();
+ field != options.getIncludeNames().end(); ++field) {
+ updateSelectedByName(*field);
+ }
+ } else if (options.getTypeIdsSet()) {
+ for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin();
+ typeId != options.getInclude().end(); ++typeId) {
+ updateSelectedByTypeId(*typeId);
+ }
} else {
+ // default is to select all columns
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
- selectedColumns[0] = true;
- }
-
- void ReaderImpl::selectType(const Type& type) {
- if (!selectedColumns[static_cast<size_t>(type.getColumnId())]) {
- selectedColumns[static_cast<size_t>(type.getColumnId())] = true;
- for (uint64_t i=0; i < type.getSubtypeCount(); i++) {
- selectType(*type.getSubtype(i));
- }
- }
+ selectParents(*schema);
}
std::string ReaderImpl::getSerializedFileTail() const {
@@ -2209,43 +2243,63 @@ namespace orc {
}
}
- void ReaderImpl::updateSelected(const std::list<uint64_t>& fieldIds) {
- uint64_t childCount = schema->getSubtypeCount();
- for(std::list<uint64_t>::const_iterator i = fieldIds.begin();
- i != fieldIds.end(); ++i) {
- if (*i >= childCount) {
- std::stringstream buffer;
- buffer << "Invalid column selected " << *i << " out of "
- << childCount;
- throw ParseError(buffer.str());
+ void ReaderImpl::updateSelectedByFieldId(uint64_t fieldId) {
+ if (fieldId < schema->getSubtypeCount()) {
+ selectChildren(*schema->getSubtype(fieldId));
+ } else {
+ std::stringstream buffer;
+ buffer << "Invalid column selected " << fieldId << " out of "
+ << schema->getSubtypeCount();
+ throw ParseError(buffer.str());
+ }
+ }
+
+ void ReaderImpl::updateSelectedByTypeId(uint64_t typeId) {
+ if (typeId < selectedColumns.size()) {
+ selectedColumns[typeId] = true;
+ } else {
+ std::stringstream buffer;
+ buffer << "Invalid type id selected " << typeId << " out of "
+ << selectedColumns.size();
+ throw ParseError(buffer.str());
+ }
+ }
+
+ void ReaderImpl::updateSelectedByName(const std::string& fieldName) {
+ for(size_t field=0; field < schema->getSubtypeCount(); ++field) {
+ if (schema->getFieldName(field) == fieldName) {
+ selectChildren(*schema->getSubtype(field));
+ return;
}
- const Type& child = *schema->getSubtype(*i);
- for(size_t c = child.getColumnId();
- c <= child.getMaximumColumnId(); ++c){
+ }
+ throw ParseError("Invalid column selected " + fieldName);
+ }
+
+ void ReaderImpl::selectChildren(const Type& type) {
+ size_t id = static_cast<size_t>(type.getColumnId());
+ if (!selectedColumns[id]) {
+ selectedColumns[id] = true;
+ for(size_t c = id; c <= type.getMaximumColumnId(); ++c){
selectedColumns[c] = true;
}
}
}
- void ReaderImpl::updateSelected(const std::list<std::string>& fieldNames) {
- uint64_t childCount = schema->getSubtypeCount();
- for(std::list<std::string>::const_iterator i = fieldNames.begin();
- i != fieldNames.end(); ++i) {
- bool foundMatch = false;
- for(size_t field=0; field < childCount; ++field) {
- if (schema->getFieldName(field) == *i) {
- const Type& child = *schema->getSubtype(field);
- for(size_t c = child.getColumnId();
- c <= child.getMaximumColumnId(); ++c){
- selectedColumns[c] = true;
- }
- foundMatch = true;
- break;
- }
- }
- if (!foundMatch) {
- throw ParseError("Invalid column selected " + *i);
- }
+ /**
+ * Recurses over the whole type tree and selects every type that has a
+ * selected descendant, so the ancestors of each selected type are
+ * selected too.
+ *
+ * The walk always descends into the subtypes, even below a type that is
+ * already selected. Stopping at the first selected type would miss a
+ * selected type nested under an unselected intermediate type, e.g. type
+ * ids 19 and 22 of map(19)<string(20),struct(21)<int1(22),string1(23)>>
+ * would leave struct(21) unselected.
+ * @param type the root of the subtree to examine
+ * @return true if type or any of its descendants is selected.
+ */
+ bool ReaderImpl::selectParents(const Type& type) {
+ size_t id = static_cast<size_t>(type.getColumnId());
+ // start from the explicit selection instead of returning early
+ bool result = selectedColumns[id];
+ for(uint64_t c=0; c < type.getSubtypeCount(); ++c) {
+ result |= selectParents(*type.getSubtype(c));
}
+ selectedColumns[id] = result;
+ return result;
}
+
}// namespace
http://git-wip-us.apache.org/repos/asf/orc/blob/d43aa04e/tools/test/TestMatch.cc
----------------------------------------------------------------------
diff --git a/tools/test/TestMatch.cc b/tools/test/TestMatch.cc
index c7759ae..9f0c297 100644
--- a/tools/test/TestMatch.cc
+++ b/tools/test/TestMatch.cc
@@ -1035,6 +1035,35 @@ TEST(TestMatch, selectColumns) {
<< "\"887336a7\", \"value\": {\"int1\": -941468492, \"string1\": "
<< "\"887336a7\"}}]}";
EXPECT_EQ(expectedMap.str(), line);
+
+ // Map column #12
+ // two subtypes with column id:
+ // map<string(20),struct(21)<int1(22):int,string1(23):string>>
+ cols.clear();
+ cols.push_back(20);
+ cols.push_back(22);
+ cols.push_back(23);
+ opts.includeTypes(cols);
+ reader = orc::createReader(orc::readLocalFile(filename), opts);
+ c = reader->getSelectedColumns();
+ for (unsigned int i=1; i < c.size(); i++) {
+ if (i>=19 && i<=23)
+ EXPECT_TRUE(c[i]);
+ else
+ EXPECT_TRUE(!c[i]);
+ }
+ batch = reader->createRowBatch(1);
+ line.clear();
+ printer = createColumnPrinter(line, &reader->getSelectedType());
+ reader->next(*batch);
+ printer->reset(*batch);
+ printer->printRow(0);
+ std::ostringstream expectedMapWithColumnId;
+ expectedMapWithColumnId << "{\"map\": [{\"key\": \"ba419d35-x\", \"value\": {\"int1\":"
+ << " -1598014431, \"string1\": \"ba419d35-x\"}}, {\"key\": "
+ << "\"887336a7\", \"value\": {\"int1\": -941468492, \"string1\": "
+ << "\"887336a7\"}}]}";
+ EXPECT_EQ(expectedMapWithColumnId.str(), line);
}
TEST(TestMatch, memoryUse) {