You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/27 12:19:57 UTC
[arrow] 18/24: PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 144699c4210ac0b75d16395a583671b2874acc9e
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Wed Aug 30 09:49:14 2017 +0200
PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing
See ARROW-1377
Author: Wes McKinney <we...@twosigma.com>
Closes #385 from wesm/PARQUET-1083 and squashes the following commits:
359cd09 [Wes McKinney] Factor main logic in parquet-scan.cc into a library function, so that library users can use it for performance testing
Change-Id: Ia50d136c380c4d42d6c62577e02a9533df6fa6fe
---
cpp/tools/parquet/parquet-scan.cc | 38 ++------------------------------------
1 file changed, 2 insertions(+), 36 deletions(-)
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index 5bf2b18..fdc73d7 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -57,50 +57,16 @@ int main(int argc, char** argv) {
}
}
- std::vector<int16_t> rep_levels(batch_size);
- std::vector<int16_t> def_levels(batch_size);
try {
double total_time;
std::clock_t start_time = std::clock();
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(filename);
- // columns are not specified explicitly. Add all columns
- if (num_columns == 0) {
- num_columns = reader->metadata()->num_columns();
- columns.resize(num_columns);
- for (int i = 0; i < num_columns; i++) {
- columns[i] = i;
- }
- }
-
- std::vector<int64_t> total_rows(num_columns);
-
- for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
- auto group_reader = reader->RowGroup(r);
- int col = 0;
- for (auto i : columns) {
- total_rows[col] = 0;
- std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
- size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
- std::vector<uint8_t> values(batch_size * value_byte_size);
- int64_t values_read = 0;
- while (col_reader->HasNext()) {
- total_rows[col] +=
- ScanAllValues(batch_size, def_levels.data(), rep_levels.data(),
- values.data(), &values_read, col_reader.get());
- }
- col++;
- }
- }
+ int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
- for (int ct = 1; ct < num_columns; ++ct) {
- if (total_rows[0] != total_rows[ct]) {
- std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
- }
- }
- std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
+ std::cout << total_rows << " rows scanned in " << total_time << " seconds."
<< std::endl;
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;