You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2017/08/30 07:49:19 UTC
parquet-cpp git commit: PARQUET-1083: Factor logic in parquet-scan.cc
into a library function to help with perf testing
Repository: parquet-cpp
Updated Branches:
refs/heads/master f8401b15a -> c57deaca6
PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing
See ARROW-1377
Author: Wes McKinney <we...@twosigma.com>
Closes #385 from wesm/PARQUET-1083 and squashes the following commits:
359cd09 [Wes McKinney] Factor main logic in parquet-scan.cc into a library function, so that library users can use it for performance testing
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c57deaca
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c57deaca
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c57deaca
Branch: refs/heads/master
Commit: c57deaca6edd64fd125c6c995e51bdbcb6a06d59
Parents: f8401b1
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Aug 30 09:49:14 2017 +0200
Committer: Uwe L. Korn <uw...@apache.org>
Committed: Wed Aug 30 09:49:14 2017 +0200
----------------------------------------------------------------------
src/parquet/file/reader.cc | 49 +++++++++++++++++++++++++++++++++++++++++
src/parquet/file/reader.h | 9 ++++++++
tools/parquet-scan.cc | 38 ++------------------------------
3 files changed, 60 insertions(+), 36 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c57deaca/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index 6e78fa4..c27fa4d 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -134,4 +134,53 @@ std::shared_ptr<FileMetaData> ReadMetaData(
return ParquetFileReader::Open(source)->metadata();
}
+// ----------------------------------------------------------------------
+// File scanner for performance testing
+
+// Reads every value of the selected columns in every row group of the file and
+// returns the accumulated row count. Intended for performance testing.
+//
+// columns: column indices handed to the row group reader's Column(); an empty
+//   vector selects every column reported by the file metadata.
+// column_batch_size: number of values requested per ScanAllValues call.
+// reader: file reader to scan; dereferenced without a null check.
+//
+// Returns the per-column total accumulated over all row groups (taken from the
+// first selected column), or 0 when there is no column to scan.
+// Throws ParquetException if the selected columns do not all reach the same
+// total.
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+                         ParquetFileReader* reader) {
+  // Level buffers are sized once and reused for every batch of every column.
+  std::vector<int16_t> rep_levels(column_batch_size);
+  std::vector<int16_t> def_levels(column_batch_size);
+
+  int num_columns = static_cast<int>(columns.size());
+
+  // Columns were not specified explicitly: scan all of them.
+  if (columns.empty()) {
+    num_columns = reader->metadata()->num_columns();
+    columns.resize(num_columns);
+    for (int i = 0; i < num_columns; i++) {
+      columns[i] = i;
+    }
+  }
+
+  // Still nothing to scan (the file has no columns). Return early, because the
+  // code below reads total_rows[0], which would be out of bounds.
+  if (num_columns == 0) {
+    return 0;
+  }
+
+  // Zero-initialized exactly once: the counts must accumulate across row
+  // groups, so they are not reset inside the row group loop.
+  std::vector<int64_t> total_rows(num_columns, 0);
+
+  for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+    auto group_reader = reader->RowGroup(r);
+    int col = 0;  // position in total_rows, parallel to the iteration over columns
+    for (auto i : columns) {
+      std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+      size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+      // Untyped scratch buffer sized for one batch of this column's values.
+      std::vector<uint8_t> values(column_batch_size * value_byte_size);
+
+      int64_t values_read = 0;
+      while (col_reader->HasNext()) {
+        total_rows[col] +=
+            ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(),
+                          values.data(), &values_read, col_reader.get());
+      }
+      col++;
+    }
+  }
+
+  // Every scanned column must report the same total; otherwise the scan is
+  // inconsistent and there is no single row count to return.
+  for (int i = 1; i < num_columns; ++i) {
+    if (total_rows[0] != total_rows[i]) {
+      throw ParquetException("Parquet error: Total rows among columns do not match");
+    }
+  }
+
+  return total_rows[0];
+}
+
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c57deaca/src/parquet/file/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h
index eb85235..0467640 100644
--- a/src/parquet/file/reader.h
+++ b/src/parquet/file/reader.h
@@ -121,6 +121,15 @@ class PARQUET_EXPORT ParquetFileReader {
std::shared_ptr<FileMetaData> PARQUET_EXPORT
ReadMetaData(const std::shared_ptr<::arrow::io::ReadableFileInterface>& source);
+/// \brief Scan all values in file. Useful for performance testing
+/// \param[in] columns the column numbers to scan. If empty, all columns are scanned
+/// \param[in] column_batch_size number of values to read at a time when scanning column
+/// \param[in] reader a ParquetFileReader instance
+/// \return number of semantic rows in file
+/// \throws ParquetException if the scanned columns do not yield the same row total
+PARQUET_EXPORT
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+ ParquetFileReader* reader);
+
} // namespace parquet
#endif // PARQUET_FILE_READER_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c57deaca/tools/parquet-scan.cc
----------------------------------------------------------------------
diff --git a/tools/parquet-scan.cc b/tools/parquet-scan.cc
index 5bf2b18..fdc73d7 100644
--- a/tools/parquet-scan.cc
+++ b/tools/parquet-scan.cc
@@ -57,50 +57,16 @@ int main(int argc, char** argv) {
}
}
- std::vector<int16_t> rep_levels(batch_size);
- std::vector<int16_t> def_levels(batch_size);
try {
double total_time;
std::clock_t start_time = std::clock();
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(filename);
- // columns are not specified explicitly. Add all columns
- if (num_columns == 0) {
- num_columns = reader->metadata()->num_columns();
- columns.resize(num_columns);
- for (int i = 0; i < num_columns; i++) {
- columns[i] = i;
- }
- }
-
- std::vector<int64_t> total_rows(num_columns);
-
- for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
- auto group_reader = reader->RowGroup(r);
- int col = 0;
- for (auto i : columns) {
- total_rows[col] = 0;
- std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
- size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
- std::vector<uint8_t> values(batch_size * value_byte_size);
- int64_t values_read = 0;
- while (col_reader->HasNext()) {
- total_rows[col] +=
- ScanAllValues(batch_size, def_levels.data(), rep_levels.data(),
- values.data(), &values_read, col_reader.get());
- }
- col++;
- }
- }
+ int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
- for (int ct = 1; ct < num_columns; ++ct) {
- if (total_rows[0] != total_rows[ct]) {
- std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
- }
- }
- std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
+ std::cout << total_rows << " rows scanned in " << total_time << " seconds."
<< std::endl;
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;