You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/12/15 23:58:11 UTC

[orc] branch branch-1.7 updated: ORC-1012: Support specifying columns in orc-scan (#921)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.7 by this push:
     new 43b53ea  ORC-1012: Support specifying columns in orc-scan (#921)
43b53ea is described below

commit 43b53ea616e651d5d4a71537ab69dfdd5a1eed14
Author: Quanlong Huang <hu...@gmail.com>
AuthorDate: Wed Oct 6 11:20:54 2021 +0800

    ORC-1012: Support specifying columns in orc-scan (#921)
    
    ### What changes were proposed in this pull request?
    
    The PR adds an option to support specifying columns to be read in the orc-scan tool.
    
    ### Why are the changes needed?
    
    Currently the orc-scan tool will scan all columns. I find it useful to specify columns when profiling read performance on specific data types.
    
    ### How was this patch tested?
    
    Manually tested with the new option.
    Added tests in tools/test/TestFileScan.cc.
    
    (cherry picked from commit 9dcd6453d637d7e883b50f66c14c8e0f68fd9939)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 tools/src/FileScan.cc      | 25 +++++++++++++++++++++----
 tools/test/TestFileScan.cc | 18 ++++++++++++++++++
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/tools/src/FileScan.cc b/tools/src/FileScan.cc
index d91df91..a212dc0 100644
--- a/tools/src/FileScan.cc
+++ b/tools/src/FileScan.cc
@@ -26,11 +26,12 @@
 #include <iostream>
 #include <string>
 
-void scanFile(std::ostream & out, const char* filename, uint64_t batchSize) {
+void scanFile(std::ostream & out, const char* filename, uint64_t batchSize,
+              const orc::RowReaderOptions& rowReaderOpts) {
   orc::ReaderOptions readerOpts;
   std::unique_ptr<orc::Reader> reader =
     orc::createReader(orc::readFile(filename), readerOpts);
-  std::unique_ptr<orc::RowReader> rowReader = reader->createRowReader();
+  std::unique_ptr<orc::RowReader> rowReader = reader->createRowReader(rowReaderOpts);
   std::unique_ptr<orc::ColumnVectorBatch> batch =
     rowReader->createRowBatch(batchSize);
 
@@ -48,14 +49,17 @@ int main(int argc, char* argv[]) {
   static struct option longOptions[] = {
     {"help", no_argument, ORC_NULLPTR, 'h'},
     {"batch", required_argument, ORC_NULLPTR, 'b'},
+    {"columns", required_argument, ORC_NULLPTR, 'c'},
     {ORC_NULLPTR, 0, ORC_NULLPTR, 0}
   };
   bool helpFlag = false;
   uint64_t batchSize = 1024;
+  std::list<uint64_t> cols;
+  orc::RowReaderOptions rowReaderOptions;
   int opt;
   char *tail;
   do {
-    opt = getopt_long(argc, argv, "hb:", longOptions, ORC_NULLPTR);
+    opt = getopt_long(argc, argv, "hb:c:", longOptions, ORC_NULLPTR);
     switch (opt) {
     case '?':
     case 'h':
@@ -69,6 +73,18 @@ int main(int argc, char* argv[]) {
         return 1;
       }
       break;
+    case 'c': {
+      char *col = std::strtok(optarg, ",");
+      while (col) {
+        cols.push_back(static_cast<uint64_t>(std::atoi(col)));
+        col = std::strtok(ORC_NULLPTR, ",");
+      }
+      if (!cols.empty()) {
+        rowReaderOptions.include(cols);
+      }
+      break;
+    }
+    default: break;
     }
   } while (opt != -1);
   argc -= optind;
@@ -76,12 +92,13 @@ int main(int argc, char* argv[]) {
 
   if (argc < 1 || helpFlag) {
     std::cerr << "Usage: orc-scan [-h] [--help]\n"
+              << "                [-c 1,2,...] [--columns=1,2,...]\n"
               << "                [-b<size>] [--batch=<size>] <filename>\n";
     return 1;
   } else {
     for(int i=0; i < argc; ++i) {
       try {
-        scanFile(std::cout, argv[i], batchSize);
+        scanFile(std::cout, argv[i], batchSize, rowReaderOptions);
       } catch (std::exception& ex) {
         std::cerr << "Caught exception in " << argv[i]
                   << ": " << ex.what() << "\n";
diff --git a/tools/test/TestFileScan.cc b/tools/test/TestFileScan.cc
index de2f91d..b53841f 100644
--- a/tools/test/TestFileScan.cc
+++ b/tools/test/TestFileScan.cc
@@ -34,11 +34,22 @@ TEST (TestFileScan, testNominal) {
   EXPECT_EQ("Rows: 32768\nBatches: 33\n", output);
   EXPECT_EQ("", error);
 
+  EXPECT_EQ(0, runProgram({pgm, std::string("-c"), std::string("1,2,3,9"), file},
+                          output, error));
+  EXPECT_EQ("Rows: 32768\nBatches: 33\n", output);
+  EXPECT_EQ("", error);
+
   EXPECT_EQ(0, runProgram({pgm, std::string("-b"), std::string("256"), file},
                           output, error));
   EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
   EXPECT_EQ("", error);
 
+  EXPECT_EQ(0, runProgram({pgm, std::string("-b"), std::string("256"),
+                           std::string("-c"), std::string("1,2,3"), file},
+                          output, error));
+  EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
+  EXPECT_EQ("", error);
+
   EXPECT_EQ(0, runProgram({pgm, std::string("-b256"), file}, output, error));
   EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
   EXPECT_EQ("", error);
@@ -52,6 +63,11 @@ TEST (TestFileScan, testNominal) {
                           output, error));
   EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
   EXPECT_EQ("", error);
+
+  EXPECT_EQ(0, runProgram({pgm, std::string("--batch=256"),
+                           std::string("--columns=1,2,3"), file},output, error));
+  EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
+  EXPECT_EQ("", error);
 }
 
 /**
@@ -104,6 +120,7 @@ TEST (TestFileScan, testBadCommand) {
   EXPECT_EQ("", output);
   EXPECT_EQ("orc-scan: option requires an argument -- b\n"
             "Usage: orc-scan [-h] [--help]\n"
+            "                [-c 1,2,...] [--columns=1,2,...]\n"
             "                [-b<size>] [--batch=<size>] <filename>\n",
             removeChars(stripPrefix(error, "orc-scan: "),"'`"));
 
@@ -122,6 +139,7 @@ TEST (TestFileScan, testBadCommand) {
   EXPECT_EQ("", output);
   EXPECT_EQ("orc-scan: option --batch requires an argument\n"
             "Usage: orc-scan [-h] [--help]\n"
+            "                [-c 1,2,...] [--columns=1,2,...]\n"
             "                [-b<size>] [--batch=<size>] <filename>\n",
             removeChars(stripPrefix(error, "orc-scan: "), "'`"));