Posted to commits@parquet.apache.org by uw...@apache.org on 2017/08/30 07:49:19 UTC

parquet-cpp git commit: PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing

Repository: parquet-cpp
Updated Branches:
  refs/heads/master f8401b15a -> c57deaca6


PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing

See ARROW-1377

Author: Wes McKinney <we...@twosigma.com>

Closes #385 from wesm/PARQUET-1083 and squashes the following commits:

359cd09 [Wes McKinney] Factor main logic in parquet-scan.cc into a library function, so that library users can use it for performance testing
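
For library users, timing a whole-file scan with the new entry point amounts to what the
simplified tools/parquet-scan.cc in this commit does. The following is a minimal sketch
only, mirroring that tool: the file name, batch size, and include path are placeholder
assumptions, not part of this commit.

  #include <cstdint>
  #include <ctime>
  #include <iostream>
  #include <memory>
  #include <string>
  #include <vector>

  #include "parquet/file/reader.h"

  int main() {
    // Placeholder inputs; substitute your own file and batch size.
    const std::string filename = "example.parquet";
    const int32_t batch_size = 256;

    try {
      std::clock_t start_time = std::clock();
      std::unique_ptr<parquet::ParquetFileReader> reader =
          parquet::ParquetFileReader::OpenFile(filename);

      // An empty column list asks ScanFileContents to scan every column.
      std::vector<int> columns;
      int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());

      double total_time =
          (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
      std::cout << total_rows << " rows scanned in " << total_time << " seconds."
                << std::endl;
    } catch (const std::exception& e) {
      std::cerr << "Parquet error: " << e.what() << std::endl;
      return -1;
    }
    return 0;
  }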


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c57deaca
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c57deaca
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c57deaca

Branch: refs/heads/master
Commit: c57deaca6edd64fd125c6c995e51bdbcb6a06d59
Parents: f8401b1
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Aug 30 09:49:14 2017 +0200
Committer: Uwe L. Korn <uw...@apache.org>
Committed: Wed Aug 30 09:49:14 2017 +0200

----------------------------------------------------------------------
 src/parquet/file/reader.cc | 49 +++++++++++++++++++++++++++++++++++++++++
 src/parquet/file/reader.h  |  9 ++++++++
 tools/parquet-scan.cc      | 38 ++------------------------------
 3 files changed, 60 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c57deaca/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index 6e78fa4..c27fa4d 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -134,4 +134,53 @@ std::shared_ptr<FileMetaData> ReadMetaData(
   return ParquetFileReader::Open(source)->metadata();
 }
 
+// ----------------------------------------------------------------------
+// File scanner for performance testing
+
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+                         ParquetFileReader* reader) {
+  std::vector<int16_t> rep_levels(column_batch_size);
+  std::vector<int16_t> def_levels(column_batch_size);
+
+  int num_columns = static_cast<int>(columns.size());
+
+  // columns are not specified explicitly. Add all columns
+  if (columns.size() == 0) {
+    num_columns = reader->metadata()->num_columns();
+    columns.resize(num_columns);
+    for (int i = 0; i < num_columns; i++) {
+      columns[i] = i;
+    }
+  }
+
+  std::vector<int64_t> total_rows(num_columns);
+
+  for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+    auto group_reader = reader->RowGroup(r);
+    int col = 0;
+    for (auto i : columns) {
+      total_rows[col] = 0;
+      std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+      size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+      std::vector<uint8_t> values(column_batch_size * value_byte_size);
+
+      int64_t values_read = 0;
+      while (col_reader->HasNext()) {
+        total_rows[col] +=
+            ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(),
+                          values.data(), &values_read, col_reader.get());
+      }
+      col++;
+    }
+  }
+
+  for (int i = 1; i < num_columns; ++i) {
+    if (total_rows[0] != total_rows[i]) {
+      throw ParquetException("Parquet error: Total rows among columns do not match");
+    }
+  }
+
+  return total_rows[0];
+}
+
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c57deaca/src/parquet/file/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.h b/src/parquet/file/reader.h
index eb85235..0467640 100644
--- a/src/parquet/file/reader.h
+++ b/src/parquet/file/reader.h
@@ -121,6 +121,15 @@ class PARQUET_EXPORT ParquetFileReader {
 std::shared_ptr<FileMetaData> PARQUET_EXPORT
 ReadMetaData(const std::shared_ptr<::arrow::io::ReadableFileInterface>& source);
 
+/// \brief Scan all values in file. Useful for performance testing
+/// \param[in] columns the column numbers to scan. If empty scans all
+/// \param[in] column_batch_size number of values to read at a time when scanning column
+/// \param[in] reader a ParquetFileReader instance
+/// \return number of semantic rows in file
+PARQUET_EXPORT
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+                         ParquetFileReader* reader);
+
 }  // namespace parquet
 
 #endif  // PARQUET_FILE_READER_H
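
Per the \param columns documentation above, a non-empty vector restricts the scan to just
those column indices. A minimal sketch of that usage, assuming an already-open reader
(the indices {0, 1} and the batch size of 128 are arbitrary illustrations, not values
from this commit):

  #include <cstdint>
  #include <vector>

  #include "parquet/file/reader.h"

  // Scan only the first two columns of an open file and return the row count.
  int64_t ScanFirstTwoColumns(parquet::ParquetFileReader* reader) {
    std::vector<int> columns = {0, 1};
    return parquet::ScanFileContents(columns, /*column_batch_size=*/128, reader);
  }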

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c57deaca/tools/parquet-scan.cc
----------------------------------------------------------------------
diff --git a/tools/parquet-scan.cc b/tools/parquet-scan.cc
index 5bf2b18..fdc73d7 100644
--- a/tools/parquet-scan.cc
+++ b/tools/parquet-scan.cc
@@ -57,50 +57,16 @@ int main(int argc, char** argv) {
     }
   }
 
-  std::vector<int16_t> rep_levels(batch_size);
-  std::vector<int16_t> def_levels(batch_size);
   try {
     double total_time;
     std::clock_t start_time = std::clock();
     std::unique_ptr<parquet::ParquetFileReader> reader =
         parquet::ParquetFileReader::OpenFile(filename);
-    // columns are not specified explicitly. Add all columns
-    if (num_columns == 0) {
-      num_columns = reader->metadata()->num_columns();
-      columns.resize(num_columns);
-      for (int i = 0; i < num_columns; i++) {
-        columns[i] = i;
-      }
-    }
-
-    std::vector<int64_t> total_rows(num_columns);
-
-    for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
-      auto group_reader = reader->RowGroup(r);
-      int col = 0;
-      for (auto i : columns) {
-        total_rows[col] = 0;
-        std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
-        size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
-        std::vector<uint8_t> values(batch_size * value_byte_size);
 
-        int64_t values_read = 0;
-        while (col_reader->HasNext()) {
-          total_rows[col] +=
-              ScanAllValues(batch_size, def_levels.data(), rep_levels.data(),
-                            values.data(), &values_read, col_reader.get());
-        }
-        col++;
-      }
-    }
+    int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
 
     total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
-    for (int ct = 1; ct < num_columns; ++ct) {
-      if (total_rows[0] != total_rows[ct]) {
-        std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
-      }
-    }
-    std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
+    std::cout << total_rows << " rows scanned in " << total_time << " seconds."
               << std::endl;
   } catch (const std::exception& e) {
     std::cerr << "Parquet error: " << e.what() << std::endl;