You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2016/09/07 04:18:27 UTC
orc git commit: ORC-97 Add support for nested column id selection.
(Chunyang Wen reviewed by omalley)
Repository: orc
Updated Branches:
refs/heads/branch-1.2 a0606e468 -> 9de9e1abc
ORC-97 Add support for nested column id selection. (Chunyang Wen reviewed by
omalley)
Fixes #59
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9de9e1ab
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9de9e1ab
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9de9e1ab
Branch: refs/heads/branch-1.2
Commit: 9de9e1abc7ace5246848a81e96259ac8343e5902
Parents: a0606e4
Author: wenchunyang <we...@baidu.com>
Authored: Wed Aug 10 10:04:21 2016 +0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Tue Sep 6 21:18:05 2016 -0700
----------------------------------------------------------------------
c++/src/Reader.cc | 72 +++++++++++++++++++++++++++++++++++---------
tools/test/TestMatch.cc | 25 +++++++++++++++
2 files changed, 82 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/9de9e1ab/c++/src/Reader.cc
----------------------------------------------------------------------
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 184adf4..9b1f1b9 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -35,6 +35,7 @@
#include <sstream>
#include <string>
#include <vector>
+#include <iterator>
#include <set>
namespace orc {
@@ -79,9 +80,9 @@ namespace orc {
enum ColumnSelection {
ColumnSelection_NONE = 0,
- ColumnSelection_FIELD_NAMES = 1,
+ ColumnSelection_NAMES = 1,
ColumnSelection_FIELD_IDS = 2,
- ColumnSelection_TYPE_IDS = 3
+ ColumnSelection_TYPE_IDS = 3,
};
struct ReaderOptionsPrivate {
@@ -147,7 +148,7 @@ namespace orc {
}
ReaderOptions& ReaderOptions::include(const std::list<std::string>& include) {
- privateBits->selection = ColumnSelection_FIELD_NAMES;
+ privateBits->selection = ColumnSelection_NAMES;
privateBits->includedColumnNames.assign(include.begin(), include.end());
privateBits->includedColumnIndexes.clear();
return *this;
@@ -200,7 +201,7 @@ namespace orc {
}
bool ReaderOptions::getNamesSet() const {
- return privateBits->selection == ColumnSelection_FIELD_NAMES;
+ return privateBits->selection == ColumnSelection_NAMES;
}
const std::list<std::string>& ReaderOptions::getIncludeNames() const {
@@ -1122,6 +1123,8 @@ namespace orc {
proto::StripeInformation currentStripeInfo;
proto::StripeFooter currentStripeFooter;
std::unique_ptr<ColumnReader> reader;
+ std::map<std::string, uint64_t> nameIdMap;
+ std::map<uint64_t, const Type*> idTypeMap;
// internal methods
proto::StripeFooter getStripeFooter(const proto::StripeInformation& info);
@@ -1129,6 +1132,10 @@ namespace orc {
void checkOrcVersion();
void readMetadata() const;
+ // build two maps: dotted column name -> type id, and type id -> Type
+ void buildTypeNameIdMap(const Type* type, std::vector<std::string>& columns);
+ std::string toDotColumnPath(const std::vector<std::string>& columns);
+
// Select the columns from the options object
void updateSelected();
@@ -1305,6 +1312,8 @@ namespace orc {
}
schema = convertType(footer->types(0), *footer);
+ std::vector<std::string> columns;
+ buildTypeNameIdMap(schema.get(), columns);
updateSelected();
}
@@ -1330,6 +1339,7 @@ namespace orc {
std::fill(selectedColumns.begin(), selectedColumns.end(), true);
}
selectParents(*schema);
+ selectedColumns[0] = true; // column 0 is selected by default
}
std::string ReaderImpl::getSerializedFileTail() const {
@@ -2256,7 +2266,8 @@ namespace orc {
void ReaderImpl::updateSelectedByTypeId(uint64_t typeId) {
if (typeId < selectedColumns.size()) {
- selectedColumns[typeId] = true;
+ const Type& type = *idTypeMap[typeId];
+ selectChildren(type);
} else {
std::stringstream buffer;
buffer << "Invalid type id selected " << typeId << " out of "
@@ -2266,13 +2277,12 @@ namespace orc {
}
void ReaderImpl::updateSelectedByName(const std::string& fieldName) {
- for(size_t field=0; field < schema->getSubtypeCount(); ++field) {
- if (schema->getFieldName(field) == fieldName) {
- selectChildren(*schema->getSubtype(field));
- return;
- }
+ std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName);
+ if (ite != nameIdMap.end()) {
+ updateSelectedByTypeId(ite->second);
+ } else {
+ throw ParseError("Invalid column selected " + fieldName);
}
- throw ParseError("Invalid column selected " + fieldName);
}
void ReaderImpl::selectChildren(const Type& type) {
@@ -2291,10 +2301,7 @@ namespace orc {
*/
bool ReaderImpl::selectParents(const Type& type) {
size_t id = static_cast<size_t>(type.getColumnId());
- if (selectedColumns[id]) {
- return true;
- }
- bool result = false;
+ bool result = selectedColumns[id];
for(uint64_t c=0; c < type.getSubtypeCount(); ++c) {
result |= selectParents(*type.getSubtype(c));
}
@@ -2302,4 +2309,39 @@ namespace orc {
return result;
}
+ /**
+ * Recurses over a type tree and builds two maps:
+ * map<TypeName, TypeId> and map<TypeId, Type>
+ */
+ void ReaderImpl::buildTypeNameIdMap(const Type* type, std::vector<std::string>& columns) {
+ // map<type_id, Type*>
+ idTypeMap[type->getColumnId()] = type;
+
+ if (orc::STRUCT == type->getKind()) {
+ for (size_t i = 0; i < type->getSubtypeCount(); ++i) {
+ const std::string& fieldName = type->getFieldName(i);
+ columns.push_back(fieldName);
+ nameIdMap[toDotColumnPath(columns)] = type->getSubtype(i)->getColumnId();
+ buildTypeNameIdMap(type->getSubtype(i), columns);
+ columns.pop_back();
+ }
+ } else {
+ // other non-primitive type
+ for (size_t j = 0; j < type->getSubtypeCount(); ++j) {
+ buildTypeNameIdMap(type->getSubtype(j), columns);
+ }
+ }
+ }
+
+ std::string ReaderImpl::toDotColumnPath(const std::vector<std::string>& columns) {
+ if (columns.empty()) {
+ return std::string();
+ }
+ std::ostringstream columnStream;
+ std::copy(columns.begin(), columns.end(),
+ std::ostream_iterator<std::string>(columnStream, "."));
+ std::string columnPath = columnStream.str();
+ return columnPath.substr(0, columnPath.length() - 1);
+ }
+
}// namespace
http://git-wip-us.apache.org/repos/asf/orc/blob/9de9e1ab/tools/test/TestMatch.cc
----------------------------------------------------------------------
diff --git a/tools/test/TestMatch.cc b/tools/test/TestMatch.cc
index 9f0c297..3f754ca 100644
--- a/tools/test/TestMatch.cc
+++ b/tools/test/TestMatch.cc
@@ -1064,6 +1064,31 @@ TEST(TestMatch, selectColumns) {
<< "\"887336a7\", \"value\": {\"int1\": -941468492, \"string1\": "
<< "\"887336a7\"}}]}";
EXPECT_EQ(expectedMapWithColumnId.str(), line);
+
+ // Struct column #10, with field name: middle
+ std::list<std::string> colNames;
+ colNames.push_back("middle.list.int1");
+ colNames.push_back("middle.list.string1");
+ opts.include(colNames);
+ reader = orc::createReader(orc::readLocalFile(filename), opts);
+ c = reader->getSelectedColumns();
+ for (unsigned int i=1; i < c.size(); i++) {
+ if (i>=10 && i<=14)
+ EXPECT_TRUE(c[i]);
+ else
+ EXPECT_TRUE(!c[i]);
+ }
+ batch = reader->createRowBatch(1);
+ line.clear();
+ printer = createColumnPrinter(line, &reader->getSelectedType());
+ reader->next(*batch);
+ printer->reset(*batch);
+ printer->printRow(0);
+ std::ostringstream expectedStructWithColumnName;
+ expectedStructWithColumnName << "{\"middle\": {\"list\": "
+ << "[{\"int1\": -941468492, \"string1\": \"887336a7\"}, "
+ << "{\"int1\": -1598014431, \"string1\": \"ba419d35-x\"}]}}";
+ EXPECT_EQ(expectedStructWithColumnName.str(), line);
}
TEST(TestMatch, memoryUse) {