You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/03/23 02:34:18 UTC

parquet-cpp git commit: PARQUET-568: Enable top-level column selection.

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 1ebf7a1b8 -> 486aa105e


PARQUET-568: Enable top-level column selection.

Author: Aliaksei Sandryhaila <al...@hp.com>

Closes #81 from asandryh/PARQUET-568 and squashes the following commits:

f619ed0 [Aliaksei Sandryhaila] Addressed PR comments.
bf12164 [Aliaksei Sandryhaila] Added column selection capability to parquet_reader.


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/486aa105
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/486aa105
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/486aa105

Branch: refs/heads/master
Commit: 486aa105eb80602095355484c8000b1125d2897a
Parents: 1ebf7a1
Author: Aliaksei Sandryhaila <al...@hp.com>
Authored: Tue Mar 22 18:34:09 2016 -0700
Committer: Wes McKinney <we...@apache.org>
Committed: Tue Mar 22 18:34:09 2016 -0700

----------------------------------------------------------------------
 example/parquet_reader.cc  | 16 +++++++++++++---
 src/parquet/file/reader.cc | 38 +++++++++++++++++++++++++-------------
 src/parquet/file/reader.h  |  4 +++-
 src/parquet/reader-test.cc | 29 +++++++++++++++++++++++++++--
 4 files changed, 68 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/example/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/example/parquet_reader.cc b/example/parquet_reader.cc
index 1712ce5..9dafeb7 100644
--- a/example/parquet_reader.cc
+++ b/example/parquet_reader.cc
@@ -17,6 +17,7 @@
 
 #include <iostream>
 #include <memory>
+#include <list>
 
 #include <parquet/api/reader.h>
 
@@ -24,7 +25,7 @@ using namespace parquet_cpp;
 
 int main(int argc, char** argv) {
   if (argc > 3) {
-    std::cerr << "Usage: parquet_reader [--only-stats] [--no-memory-map] <file>"
+    std::cerr << "Usage: parquet_reader [--only-stats] [--no-memory-map] [--columns=...] <file>"
               << std::endl;
     return -1;
   }
@@ -34,12 +35,21 @@ int main(int argc, char** argv) {
   bool memory_map = true;
 
   // Read command-line options
-  char *param;
+  const std::string COLUMNS_PREFIX = "--columns=";
+  std::list<int> columns;
+
+  char *param, *value;
   for (int i = 1; i < argc; i++) {
     if ((param = std::strstr(argv[i], "--only-stats"))) {
       print_values = false;
     } else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
       memory_map = false;
+    } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
+      value = std::strtok(param+COLUMNS_PREFIX.length(), "," );
+      while (value) {
+        columns.push_back(std::atoi(value));
+        value = std::strtok(nullptr, "," );
+      }
     } else {
       filename = argv[i];
     }
@@ -48,7 +58,7 @@ int main(int argc, char** argv) {
   try {
     std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename,
         memory_map);
-    reader->DebugPrint(std::cout, print_values);
+    reader->DebugPrint(std::cout, columns, print_values);
   } catch (const std::exception& e) {
     std::cerr << "Parquet error: "
               << e.what()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index 9020008..ff61941 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -133,11 +133,24 @@ std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
 // the fixed initial size is just for an example
 #define COL_WIDTH "20"
 
-void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
+void ParquetFileReader::DebugPrint(std::ostream& stream,
+    std::list<int> selected_columns, bool print_values) {
   stream << "File statistics:\n";
-  stream << "Total rows: " << this->num_rows() << "\n";
+  stream << "Total rows: " << num_rows() << "\n";
 
-  for (int i = 0; i < num_columns(); ++i) {
+  if (selected_columns.size() == 0) {
+    for (int i = 0; i < num_columns(); i++) {
+      selected_columns.push_back(i);
+    }
+  } else {
+    for (auto i : selected_columns) {
+      if (i < 0 || i >= num_columns()) {
+        throw ParquetException("Selected column is out of range");
+      }
+    }
+  }
+
+  for (auto i : selected_columns) {
     const ColumnDescriptor* descr = schema_->Column(i);
     stream << "Column " << i << ": "
            << descr->name()
@@ -152,9 +165,7 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
     auto group_reader = RowGroup(r);
 
     // Print column metadata
-    int num_columns = group_reader->num_columns();
-
-    for (int i = 0; i < num_columns; ++i) {
+    for (auto i : selected_columns) {
       RowGroupStatistics stats = group_reader->GetColumnStats(i);
 
       stream << "Column " << i << ": "
@@ -174,9 +185,10 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
     static constexpr int bufsize = 25;
     char buffer[bufsize];
 
-    // Create readers for all columns and print contents
-    vector<std::shared_ptr<Scanner> > scanners(num_columns, NULL);
-    for (int i = 0; i < num_columns; ++i) {
+    // Create readers for selected columns and print contents
+    vector<std::shared_ptr<Scanner> > scanners(selected_columns.size(), NULL);
+    int j = 0;
+    for (auto i : selected_columns) {
       std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
 
       std::stringstream ss;
@@ -188,17 +200,17 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
 
       // This is OK in this method as long as the RowGroupReader does not get
       // deleted
-      scanners[i] = Scanner::Make(col_reader);
+      scanners[j++] = Scanner::Make(col_reader);
     }
     stream << "\n";
 
     bool hasRow;
     do {
       hasRow = false;
-      for (int i = 0; i < num_columns; ++i) {
-        if (scanners[i]->HasNext()) {
+      for (auto scanner : scanners) {
+        if (scanner->HasNext()) {
           hasRow = true;
-          scanners[i]->PrintNext(stream, 17);
+          scanner->PrintNext(stream, 17);
         }
       }
       stream << "\n";

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/src/parquet/file/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h
index f4455ac..3a54cfb 100644
--- a/src/parquet/file/reader.h
+++ b/src/parquet/file/reader.h
@@ -21,6 +21,7 @@
 #include <cstdint>
 #include <iosfwd>
 #include <memory>
+#include <list>
 #include <string>
 
 #include "parquet/column/page.h"
@@ -119,7 +120,8 @@ class ParquetFileReader {
     return schema_->Column(i);
   }
 
-  void DebugPrint(std::ostream& stream, bool print_values = true);
+  void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+      bool print_values = true);
 
  private:
   // PIMPL idiom

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/486aa105/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index 10bcff7..034d4e2 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -124,13 +124,38 @@ TEST_F(TestAllTypesPlain, TestSetScannerBatchSize) {
 TEST_F(TestAllTypesPlain, DebugPrintWorks) {
   std::stringstream ss;
 
-  // Automatically parses metadata
-  reader_->DebugPrint(ss);
+  std::list<int> columns;
+  reader_->DebugPrint(ss, columns);
 
   std::string result = ss.str();
   ASSERT_GT(result.size(), 0);
 }
 
+TEST_F(TestAllTypesPlain, ColumnSelection) {
+  std::stringstream ss;
+
+  std::list<int> columns;
+  columns.push_back(5);
+  columns.push_back(0);
+  columns.push_back(10);
+  reader_->DebugPrint(ss, columns);
+
+  std::string result = ss.str();
+  ASSERT_GT(result.size(), 0);
+}
+
+TEST_F(TestAllTypesPlain, ColumnSelectionOutOfRange) {
+  std::stringstream ss;
+
+  std::list<int> columns;
+  columns.push_back(100);
+  ASSERT_THROW(reader_->DebugPrint(ss, columns), ParquetException);
+
+  columns.clear();
+  columns.push_back(-1);
+  ASSERT_THROW(reader_->DebugPrint(ss, columns), ParquetException);
+}
+
 
 class TestLocalFileSource : public ::testing::Test {
  public: