You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/11/15 16:19:05 UTC
[arrow] branch master updated: PARQUET-2206: [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip (#14523)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3b852e49fe PARQUET-2206: [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip (#14523)
3b852e49fe is described below
commit 3b852e49fec85b57545c6edc6c66d3da93de2c06
Author: Fatemah Panahi <fa...@users.noreply.github.com>
AuthorDate: Tue Nov 15 16:18:57 2022 +0000
PARQUET-2206: [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip (#14523)
Add a microbenchmark for ColumnReader's ReadBatch and Skip. Benchmarks for RecordReader's ReadRecords and SkipRecords will be added later.
Here are the results from my machine:
```
-------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------
BM_Skip/0/0/0/1/iterations:1000 11250680 ns 11133405 ns 1000
BM_Skip/0/0/0/1000/iterations:1000 134092 ns 134455 ns 1000
BM_Skip/0/0/0/10000/iterations:1000 175717 ns 175677 ns 1000
BM_Skip/0/0/0/100000/iterations:1000 217368 ns 218672 ns 1000
BM_Skip/0/0/1/1/iterations:1000 150319842 ns 149567587 ns 1000
BM_Skip/0/0/1/1000/iterations:1000 244565 ns 244931 ns 1000
BM_Skip/0/0/1/10000/iterations:1000 115395 ns 115924 ns 1000
BM_Skip/0/0/1/100000/iterations:1000 115241 ns 115916 ns 1000
BM_Skip/1/0/0/1/iterations:1000 23289018 ns 23190677 ns 1000
BM_Skip/1/0/0/1000/iterations:1000 622022 ns 621621 ns 1000
BM_Skip/1/0/0/10000/iterations:1000 540981 ns 540620 ns 1000
BM_Skip/1/0/0/100000/iterations:1000 543156 ns 543126 ns 1000
BM_Skip/1/0/1/1/iterations:1000 149224507 ns 148683644 ns 1000
BM_Skip/1/0/1/1000/iterations:1000 805812 ns 805417 ns 1000
BM_Skip/1/0/1/10000/iterations:1000 702999 ns 700108 ns 1000
BM_Skip/1/0/1/100000/iterations:1000 654163 ns 651947 ns 1000
BM_Skip/1/1/0/1/iterations:1000 33160880 ns 33051386 ns 1000
BM_Skip/1/1/0/1000/iterations:1000 999412 ns 998795 ns 1000
BM_Skip/1/1/0/10000/iterations:1000 815868 ns 814927 ns 1000
BM_Skip/1/1/0/100000/iterations:1000 781166 ns 781112 ns 1000
BM_Skip/1/1/1/1/iterations:1000 165600118 ns 164864530 ns 1000
BM_Skip/1/1/1/1000/iterations:1000 1130975 ns 1130252 ns 1000
BM_Skip/1/1/1/10000/iterations:1000 1009628 ns 1009589 ns 1000
BM_Skip/1/1/1/100000/iterations:1000 1029064 ns 1028726 ns 1000
```
Lead-authored-by: Fatemah Panahi <pa...@google.com>
Co-authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/CMakeLists.txt | 1 +
cpp/src/parquet/column_reader_benchmark.cc | 157 +++++++++++++++++++++++++++++
2 files changed, 158 insertions(+)
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index dc55ab158d..17e6fcda72 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -385,6 +385,7 @@ endif()
add_parquet_test(file_deserialize_test SOURCES file_deserialize_test.cc test_util.cc)
add_parquet_test(schema_test)
+add_parquet_benchmark(column_reader_benchmark)
add_parquet_benchmark(column_io_benchmark)
add_parquet_benchmark(encoding_benchmark)
add_parquet_benchmark(level_conversion_benchmark)
diff --git a/cpp/src/parquet/column_reader_benchmark.cc b/cpp/src/parquet/column_reader_benchmark.cc
new file mode 100644
index 0000000000..00af976851
--- /dev/null
+++ b/cpp/src/parquet/column_reader_benchmark.cc
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Test fixture for the column reader benchmarks below.
+//
+// The pages are generated once, in the constructor; ResetReader() then builds a
+// new ColumnReader over those same pages each time it is called.
+class BenchmarkHelper {
+ public:
+  // Generates `num_pages` PLAIN-encoded pages of `levels_per_page` levels each
+  // for a single INT32 column named "b" with the given `repetition`.
+  BenchmarkHelper(Repetition::type repetition, int num_pages, int levels_per_page) {
+    NodePtr type = schema::Int32("b", repetition);
+
+    // Descriptor arguments are (node, max_definition_level, max_repetition_level).
+    // Any repetition other than REQUIRED/OPTIONAL is handled as REPEATED.
+    if (repetition == Repetition::REQUIRED) {
+      descr_ = std::make_unique<ColumnDescriptor>(type, 0, 0);
+    } else if (repetition == Repetition::OPTIONAL) {
+      descr_ = std::make_unique<ColumnDescriptor>(type, 1, 0);
+    } else {
+      descr_ = std::make_unique<ColumnDescriptor>(type, 1, 1);
+    }
+
+    // Vectors filled with random rep/defs and values to make pages.
+    std::vector<int32_t> values;
+    std::vector<int16_t> def_levels;
+    std::vector<int16_t> rep_levels;
+    std::vector<uint8_t> data_buffer;
+    MakePages<Int32Type>(descr_.get(), num_pages, levels_per_page, def_levels, rep_levels,
+                         values, data_buffer, pages_, Encoding::PLAIN);
+
+    // Sum of the page sizes, reported by the benchmarks as bytes processed.
+    for (const auto& page : pages_) {
+      total_size_ += page->size();
+    }
+  }
+
+  // Creates a new column reader over the generated pages and returns it.
+  //
+  // The returned pointer is owned by this helper: it stays valid only until the
+  // next ResetReader() call (which replaces the reader) or until the helper is
+  // destroyed.
+  Int32Reader* ResetReader() {
+    // make_unique instead of a naked `new`; the derived pointer converts
+    // implicitly to the unique_ptr<PageReader> that ColumnReader::Make takes.
+    std::unique_ptr<PageReader> pager = std::make_unique<test::MockPageReader>(pages_);
+    column_reader_ = ColumnReader::Make(descr_.get(), std::move(pager));
+    return static_cast<Int32Reader*>(column_reader_.get());
+  }
+
+  // Total of page->size() over all generated pages.
+  int64_t total_size() const { return total_size_; }
+
+ private:
+  std::vector<std::shared_ptr<Page>> pages_;
+  std::unique_ptr<ColumnDescriptor> descr_;
+  // Reader handed out by the most recent ResetReader() call.
+  std::shared_ptr<ColumnReader> column_reader_;
+  int64_t total_size_ = 0;
+};
+
+// Benchmarks ColumnReader::Skip on an INT32 column.
+// Benchmark arguments, in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: how many values each Skip call is asked to skip.
+static void ColumnReaderSkipInt32(::benchmark::State& state) {
+  const auto rep_type = static_cast<Repetition::type>(state.range(0));
+  const auto skip_size = static_cast<int64_t>(state.range(1));
+
+  BenchmarkHelper helper(rep_type, /*num_pages=*/16, /*levels_per_page=*/80000);
+
+  for (auto _ : state) {
+    // Creating the reader is setup work; keep it out of the timed region.
+    state.PauseTiming();
+    Int32Reader* reader = helper.ResetReader();
+    int64_t skipped = 0;
+    state.ResumeTiming();
+
+    // Keep skipping until a call reports that nothing was skipped.
+    do {
+      DoNotOptimize(skipped = reader->Skip(skip_size));
+    } while (skipped != 0);
+  }
+
+  const int64_t bytes_per_iteration = helper.total_size();
+  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
+}
+
+// Benchmarks ColumnReader::ReadBatch on an INT32 column.
+// Benchmark arguments, in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: how many values each ReadBatch call is asked to read.
+static void ColumnReaderReadBatchInt32(::benchmark::State& state) {
+  const auto rep_type = static_cast<Repetition::type>(state.range(0));
+  const auto read_size = static_cast<int64_t>(state.range(1));
+
+  BenchmarkHelper helper(rep_type, /*num_pages=*/16, /*levels_per_page=*/80000);
+
+  // Output buffers, allocated once and reused by every ReadBatch call.
+  std::vector<int32_t> value_buffer(read_size, -1);
+  std::vector<int16_t> def_level_buffer(read_size, -1);
+  std::vector<int16_t> rep_level_buffer(read_size, -1);
+
+  for (auto _ : state) {
+    // Creating the reader is setup work; keep it out of the timed region.
+    state.PauseTiming();
+    Int32Reader* reader = helper.ResetReader();
+    int64_t levels_read = 0;
+    state.ResumeTiming();
+
+    // Keep reading until a call returns 0.
+    do {
+      int64_t values_read = 0;
+      DoNotOptimize(levels_read = reader->ReadBatch(read_size, def_level_buffer.data(),
+                                                    rep_level_buffer.data(),
+                                                    value_buffer.data(), &values_read));
+    } while (levels_read != 0);
+  }
+
+  const int64_t bytes_per_iteration = helper.total_size();
+  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
+}
+
+// Registers the Skip benchmark for every combination of
+// Repetition in {0, 1, 2} and BatchSize in {100, 1000, 10000, 100000}.
+BENCHMARK(ColumnReaderSkipInt32)
+ ->ArgNames({"Repetition", "BatchSize"})
+ ->Args({0, 100})
+ ->Args({0, 1000})
+ ->Args({0, 10000})
+ ->Args({0, 100000})
+ ->Args({1, 100})
+ ->Args({1, 1000})
+ ->Args({1, 10000})
+ ->Args({1, 100000})
+ ->Args({2, 100})
+ ->Args({2, 1000})
+ ->Args({2, 10000})
+ ->Args({2, 100000});
+
+// Registers the ReadBatch benchmark for every combination of
+// Repetition in {0, 1, 2} and BatchSize in {100, 1000, 10000, 100000}.
+BENCHMARK(ColumnReaderReadBatchInt32)
+ ->ArgNames({"Repetition", "BatchSize"})
+ ->Args({0, 100})
+ ->Args({0, 1000})
+ ->Args({0, 10000})
+ ->Args({0, 100000})
+ ->Args({1, 100})
+ ->Args({1, 1000})
+ ->Args({1, 10000})
+ ->Args({1, 100000})
+ ->Args({2, 100})
+ ->Args({2, 1000})
+ ->Args({2, 10000})
+ ->Args({2, 100000});
+
+} // namespace benchmark
+} // namespace parquet