You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/27 12:19:57 UTC

[arrow] 18/24: PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 144699c4210ac0b75d16395a583671b2874acc9e
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Wed Aug 30 09:49:14 2017 +0200

    PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing
    
    See ARROW-1377
    
    Author: Wes McKinney <we...@twosigma.com>
    
    Closes #385 from wesm/PARQUET-1083 and squashes the following commits:
    
    359cd09 [Wes McKinney] Factor main logic in parquet-scan.cc into a library function, so that library users can use it for performance testing
    
    Change-Id: Ia50d136c380c4d42d6c62577e02a9533df6fa6fe
---
 cpp/tools/parquet/parquet-scan.cc | 38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index 5bf2b18..fdc73d7 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -57,50 +57,16 @@ int main(int argc, char** argv) {
     }
   }
 
-  std::vector<int16_t> rep_levels(batch_size);
-  std::vector<int16_t> def_levels(batch_size);
   try {
     double total_time;
     std::clock_t start_time = std::clock();
     std::unique_ptr<parquet::ParquetFileReader> reader =
         parquet::ParquetFileReader::OpenFile(filename);
-    // columns are not specified explicitly. Add all columns
-    if (num_columns == 0) {
-      num_columns = reader->metadata()->num_columns();
-      columns.resize(num_columns);
-      for (int i = 0; i < num_columns; i++) {
-        columns[i] = i;
-      }
-    }
-
-    std::vector<int64_t> total_rows(num_columns);
-
-    for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
-      auto group_reader = reader->RowGroup(r);
-      int col = 0;
-      for (auto i : columns) {
-        total_rows[col] = 0;
-        std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
-        size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
-        std::vector<uint8_t> values(batch_size * value_byte_size);
 
-        int64_t values_read = 0;
-        while (col_reader->HasNext()) {
-          total_rows[col] +=
-              ScanAllValues(batch_size, def_levels.data(), rep_levels.data(),
-                            values.data(), &values_read, col_reader.get());
-        }
-        col++;
-      }
-    }
+    int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
 
     total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
-    for (int ct = 1; ct < num_columns; ++ct) {
-      if (total_rows[0] != total_rows[ct]) {
-        std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
-      }
-    }
-    std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
+    std::cout << total_rows << " rows scanned in " << total_time << " seconds."
               << std::endl;
   } catch (const std::exception& e) {
     std::cerr << "Parquet error: " << e.what() << std::endl;