You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/03/23 02:34:18 UTC
parquet-cpp git commit: PARQUET-568: Enable top-level column selection.
Repository: parquet-cpp
Updated Branches:
refs/heads/master 1ebf7a1b8 -> 486aa105e
PARQUET-568: Enable top-level column selection.
Author: Aliaksei Sandryhaila <al...@hp.com>
Closes #81 from asandryh/PARQUET-568 and squashes the following commits:
f619ed0 [Aliaksei Sandryhaila] Addressed PR comments.
bf12164 [Aliaksei Sandryhaila] Added column selection capability to parquet_reader.
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/486aa105
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/486aa105
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/486aa105
Branch: refs/heads/master
Commit: 486aa105eb80602095355484c8000b1125d2897a
Parents: 1ebf7a1
Author: Aliaksei Sandryhaila <al...@hp.com>
Authored: Tue Mar 22 18:34:09 2016 -0700
Committer: Wes McKinney <we...@apache.org>
Committed: Tue Mar 22 18:34:09 2016 -0700
----------------------------------------------------------------------
example/parquet_reader.cc | 16 +++++++++++++---
src/parquet/file/reader.cc | 38 +++++++++++++++++++++++++-------------
src/parquet/file/reader.h | 4 +++-
src/parquet/reader-test.cc | 29 +++++++++++++++++++++++++++--
4 files changed, 68 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/example/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/example/parquet_reader.cc b/example/parquet_reader.cc
index 1712ce5..9dafeb7 100644
--- a/example/parquet_reader.cc
+++ b/example/parquet_reader.cc
@@ -17,6 +17,7 @@
#include <iostream>
#include <memory>
+#include <list>
#include <parquet/api/reader.h>
@@ -24,7 +25,7 @@ using namespace parquet_cpp;
int main(int argc, char** argv) {
if (argc > 3) {
- std::cerr << "Usage: parquet_reader [--only-stats] [--no-memory-map] <file>"
+ std::cerr << "Usage: parquet_reader [--only-stats] [--no-memory-map] [--columns=...] <file>"
<< std::endl;
return -1;
}
@@ -34,12 +35,21 @@ int main(int argc, char** argv) {
bool memory_map = true;
// Read command-line options
- char *param;
+ const std::string COLUMNS_PREFIX = "--columns=";
+ std::list<int> columns;
+
+ char *param, *value;
for (int i = 1; i < argc; i++) {
if ((param = std::strstr(argv[i], "--only-stats"))) {
print_values = false;
} else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
memory_map = false;
+ } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
+ value = std::strtok(param+COLUMNS_PREFIX.length(), "," );
+ while (value) {
+ columns.push_back(std::atoi(value));
+ value = std::strtok(nullptr, "," );
+ }
} else {
filename = argv[i];
}
@@ -48,7 +58,7 @@ int main(int argc, char** argv) {
try {
std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename,
memory_map);
- reader->DebugPrint(std::cout, print_values);
+ reader->DebugPrint(std::cout, columns, print_values);
} catch (const std::exception& e) {
std::cerr << "Parquet error: "
<< e.what()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index 9020008..ff61941 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -133,11 +133,24 @@ std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
// the fixed initial size is just for an example
#define COL_WIDTH "20"
-void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
+void ParquetFileReader::DebugPrint(std::ostream& stream,
+ std::list<int> selected_columns, bool print_values) {
stream << "File statistics:\n";
- stream << "Total rows: " << this->num_rows() << "\n";
+ stream << "Total rows: " << num_rows() << "\n";
- for (int i = 0; i < num_columns(); ++i) {
+ if (selected_columns.size() == 0) {
+ for (int i = 0; i < num_columns(); i++) {
+ selected_columns.push_back(i);
+ }
+ } else {
+ for (auto i : selected_columns) {
+ if (i < 0 || i >= num_columns()) {
+ throw ParquetException("Selected column is out of range");
+ }
+ }
+ }
+
+ for (auto i : selected_columns) {
const ColumnDescriptor* descr = schema_->Column(i);
stream << "Column " << i << ": "
<< descr->name()
@@ -152,9 +165,7 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
auto group_reader = RowGroup(r);
// Print column metadata
- int num_columns = group_reader->num_columns();
-
- for (int i = 0; i < num_columns; ++i) {
+ for (auto i : selected_columns) {
RowGroupStatistics stats = group_reader->GetColumnStats(i);
stream << "Column " << i << ": "
@@ -174,9 +185,10 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
static constexpr int bufsize = 25;
char buffer[bufsize];
- // Create readers for all columns and print contents
- vector<std::shared_ptr<Scanner> > scanners(num_columns, NULL);
- for (int i = 0; i < num_columns; ++i) {
+ // Create readers for selected columns and print contents
+ vector<std::shared_ptr<Scanner> > scanners(selected_columns.size(), NULL);
+ int j = 0;
+ for (auto i : selected_columns) {
std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
std::stringstream ss;
@@ -188,17 +200,17 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
// This is OK in this method as long as the RowGroupReader does not get
// deleted
- scanners[i] = Scanner::Make(col_reader);
+ scanners[j++] = Scanner::Make(col_reader);
}
stream << "\n";
bool hasRow;
do {
hasRow = false;
- for (int i = 0; i < num_columns; ++i) {
- if (scanners[i]->HasNext()) {
+ for (auto scanner : scanners) {
+ if (scanner->HasNext()) {
hasRow = true;
- scanners[i]->PrintNext(stream, 17);
+ scanner->PrintNext(stream, 17);
}
}
stream << "\n";
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/src/parquet/file/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h
index f4455ac..3a54cfb 100644
--- a/src/parquet/file/reader.h
+++ b/src/parquet/file/reader.h
@@ -21,6 +21,7 @@
#include <cstdint>
#include <iosfwd>
#include <memory>
+#include <list>
#include <string>
#include "parquet/column/page.h"
@@ -119,7 +120,8 @@ class ParquetFileReader {
return schema_->Column(i);
}
- void DebugPrint(std::ostream& stream, bool print_values = true);
+ void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+ bool print_values = true);
private:
// PIMPL idiom
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index 10bcff7..034d4e2 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -124,13 +124,38 @@ TEST_F(TestAllTypesPlain, TestSetScannerBatchSize) {
TEST_F(TestAllTypesPlain, DebugPrintWorks) {
std::stringstream ss;
- // Automatically parses metadata
- reader_->DebugPrint(ss);
+ std::list<int> columns;
+ reader_->DebugPrint(ss, columns);
std::string result = ss.str();
ASSERT_GT(result.size(), 0);
}
+TEST_F(TestAllTypesPlain, ColumnSelection) {
+ std::stringstream ss;
+
+ std::list<int> columns;
+ columns.push_back(5);
+ columns.push_back(0);
+ columns.push_back(10);
+ reader_->DebugPrint(ss, columns);
+
+ std::string result = ss.str();
+ ASSERT_GT(result.size(), 0);
+}
+
+TEST_F(TestAllTypesPlain, ColumnSelectionOutOfRange) {
+ std::stringstream ss;
+
+ std::list<int> columns;
+ columns.push_back(100);
+ ASSERT_THROW(reader_->DebugPrint(ss, columns), ParquetException);
+
+ columns.clear();
+ columns.push_back(-1);
+ ASSERT_THROW(reader_->DebugPrint(ss, columns), ParquetException);
+}
+
class TestLocalFileSource : public ::testing::Test {
public: