You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/11/15 16:19:05 UTC

[arrow] branch master updated: PARQUET-2206: [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip (#14523)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 3b852e49fe PARQUET-2206: [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip (#14523)
3b852e49fe is described below

commit 3b852e49fec85b57545c6edc6c66d3da93de2c06
Author: Fatemah Panahi <fa...@users.noreply.github.com>
AuthorDate: Tue Nov 15 16:18:57 2022 +0000

    PARQUET-2206: [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip (#14523)
    
    Adding a microbenchmark for ColumnReader ReadBatch and Skip. Later, I will add benchmarks for RecordReader's ReadRecords and SkipRecords.
    
    Here are the results from my machine:
    ```
    -------------------------------------------------------------------------------
    Benchmark                                     Time             CPU   Iterations
    -------------------------------------------------------------------------------
    BM_Skip/0/0/0/1/iterations:1000        11250680 ns     11133405 ns         1000
    BM_Skip/0/0/0/1000/iterations:1000       134092 ns       134455 ns         1000
    BM_Skip/0/0/0/10000/iterations:1000      175717 ns       175677 ns         1000
    BM_Skip/0/0/0/100000/iterations:1000     217368 ns       218672 ns         1000
    BM_Skip/0/0/1/1/iterations:1000       150319842 ns    149567587 ns         1000
    BM_Skip/0/0/1/1000/iterations:1000       244565 ns       244931 ns         1000
    BM_Skip/0/0/1/10000/iterations:1000      115395 ns       115924 ns         1000
    BM_Skip/0/0/1/100000/iterations:1000     115241 ns       115916 ns         1000
    BM_Skip/1/0/0/1/iterations:1000        23289018 ns     23190677 ns         1000
    BM_Skip/1/0/0/1000/iterations:1000       622022 ns       621621 ns         1000
    BM_Skip/1/0/0/10000/iterations:1000      540981 ns       540620 ns         1000
    BM_Skip/1/0/0/100000/iterations:1000     543156 ns       543126 ns         1000
    BM_Skip/1/0/1/1/iterations:1000       149224507 ns    148683644 ns         1000
    BM_Skip/1/0/1/1000/iterations:1000       805812 ns       805417 ns         1000
    BM_Skip/1/0/1/10000/iterations:1000      702999 ns       700108 ns         1000
    BM_Skip/1/0/1/100000/iterations:1000     654163 ns       651947 ns         1000
    BM_Skip/1/1/0/1/iterations:1000        33160880 ns     33051386 ns         1000
    BM_Skip/1/1/0/1000/iterations:1000       999412 ns       998795 ns         1000
    BM_Skip/1/1/0/10000/iterations:1000      815868 ns       814927 ns         1000
    BM_Skip/1/1/0/100000/iterations:1000     781166 ns       781112 ns         1000
    BM_Skip/1/1/1/1/iterations:1000       165600118 ns    164864530 ns         1000
    BM_Skip/1/1/1/1000/iterations:1000      1130975 ns      1130252 ns         1000
    BM_Skip/1/1/1/10000/iterations:1000     1009628 ns      1009589 ns         1000
    BM_Skip/1/1/1/100000/iterations:1000    1029064 ns      1028726 ns         1000
    ```
    
    Lead-authored-by: Fatemah Panahi <pa...@google.com>
    Co-authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/parquet/CMakeLists.txt             |   1 +
 cpp/src/parquet/column_reader_benchmark.cc | 157 +++++++++++++++++++++++++++++
 2 files changed, 158 insertions(+)

diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index dc55ab158d..17e6fcda72 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -385,6 +385,7 @@ endif()
 add_parquet_test(file_deserialize_test SOURCES file_deserialize_test.cc test_util.cc)
 add_parquet_test(schema_test)
 
+add_parquet_benchmark(column_reader_benchmark)
 add_parquet_benchmark(column_io_benchmark)
 add_parquet_benchmark(encoding_benchmark)
 add_parquet_benchmark(level_conversion_benchmark)
diff --git a/cpp/src/parquet/column_reader_benchmark.cc b/cpp/src/parquet/column_reader_benchmark.cc
new file mode 100644
index 0000000000..00af976851
--- /dev/null
+++ b/cpp/src/parquet/column_reader_benchmark.cc
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Fixture shared by the ColumnReader benchmarks. It owns one INT32 column
+// descriptor and a set of PLAIN-encoded pages generated once, at construction,
+// by test::MakePages. ResetReader() wraps those same pages in a new reader, so
+// page generation is not repeated for every benchmark iteration.
+class BenchmarkHelper {
+ public:
+  // Builds the column descriptor for `repetition` and generates `num_pages`
+  // pages with `levels_per_page` levels each.
+  //
+  // The (max definition level, max repetition level) pair handed to
+  // ColumnDescriptor is (0, 0) for REQUIRED, (1, 0) for OPTIONAL and (1, 1) for
+  // any other repetition value.
+  BenchmarkHelper(Repetition::type repetition, int num_pages, int levels_per_page) {
+    NodePtr type = schema::Int32("b", repetition);
+
+    if (repetition == Repetition::REQUIRED) {
+      descr_ = std::make_unique<ColumnDescriptor>(type, 0, 0);
+    } else if (repetition == Repetition::OPTIONAL) {
+      descr_ = std::make_unique<ColumnDescriptor>(type, 1, 0);
+    } else {
+      descr_ = std::make_unique<ColumnDescriptor>(type, 1, 1);
+    }
+
+    // Scratch vectors that MakePages fills while building the pages. Only the
+    // resulting pages are kept as a member.
+    std::vector<int32_t> values;
+    std::vector<int16_t> def_levels;
+    std::vector<int16_t> rep_levels;
+    std::vector<uint8_t> data_buffer;
+    MakePages<Int32Type>(descr_.get(), num_pages, levels_per_page, def_levels, rep_levels,
+                         values, data_buffer, pages_, Encoding::PLAIN);
+    // Accumulate the page sizes so benchmarks can report bytes processed.
+    for (const auto& page : pages_) {
+      total_size_ += page->size();
+    }
+  }
+
+  // Replaces the helper's reader with a new one over the stored pages and
+  // returns it. The helper keeps ownership: do not use the returned pointer
+  // after the next ResetReader() call or after the helper is destroyed.
+  Int32Reader* ResetReader() {
+    std::unique_ptr<PageReader> pager = std::make_unique<test::MockPageReader>(pages_);
+    column_reader_ = ColumnReader::Make(descr_.get(), std::move(pager));
+    // The descriptor is always built from schema::Int32 (see the constructor),
+    // which is why the reader is downcast to Int32Reader.
+    return static_cast<Int32Reader*>(column_reader_.get());
+  }
+
+  // Sum of page->size() over all generated pages.
+  int64_t total_size() const { return total_size_; }
+
+ private:
+  std::vector<std::shared_ptr<Page>> pages_;
+  std::unique_ptr<ColumnDescriptor> descr_;
+  std::shared_ptr<ColumnReader> column_reader_;
+  int64_t total_size_ = 0;
+};
+
+// Benchmarks ColumnReader::Skip on an INT32 column. Parameters, in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: how many values each Skip() call is asked to skip.
+static void ColumnReaderSkipInt32(::benchmark::State& state) {
+  const auto batch_size = static_cast<int64_t>(state.range(1));
+  BenchmarkHelper helper(static_cast<Repetition::type>(state.range(0)),
+                         /*num_pages=*/16, /*levels_per_page=*/80000);
+
+  for (auto _ : state) {
+    // Creating the reader is setup work, so it happens with the timer paused.
+    state.PauseTiming();
+    Int32Reader* reader = helper.ResetReader();
+    int64_t skipped = 0;
+    state.ResumeTiming();
+
+    // Keep calling Skip() until it returns 0.
+    do {
+      DoNotOptimize(skipped = reader->Skip(batch_size));
+    } while (skipped != 0);
+  }
+
+  state.SetBytesProcessed(state.iterations() * helper.total_size());
+}
+
+// Benchmarks ColumnReader::ReadBatch on an INT32 column. Parameters, in order:
+// - repetition: 0 for REQUIRED, 1 for OPTIONAL, 2 for REPEATED.
+// - batch_size: how many values each ReadBatch() call is asked to read.
+static void ColumnReaderReadBatchInt32(::benchmark::State& state) {
+  const auto batch_size = static_cast<int64_t>(state.range(1));
+  BenchmarkHelper helper(static_cast<Repetition::type>(state.range(0)),
+                         /*num_pages=*/16, /*levels_per_page=*/80000);
+
+  // Output buffers, each sized for one batch and reused by every call.
+  std::vector<int16_t> def_out(batch_size, -1);
+  std::vector<int16_t> rep_out(batch_size, -1);
+  std::vector<int32_t> value_out(batch_size, -1);
+
+  for (auto _ : state) {
+    // Creating the reader is setup work, so it happens with the timer paused.
+    state.PauseTiming();
+    Int32Reader* reader = helper.ResetReader();
+    int64_t read_count = 0;
+    state.ResumeTiming();
+
+    // Keep calling ReadBatch() until it returns 0.
+    do {
+      int64_t values_read = 0;
+      read_count = reader->ReadBatch(batch_size, def_out.data(), rep_out.data(),
+                                     value_out.data(), &values_read);
+      DoNotOptimize(read_count);
+    } while (read_count != 0);
+  }
+
+  state.SetBytesProcessed(state.iterations() * helper.total_size());
+}
+
+BENCHMARK(ColumnReaderSkipInt32)  // One registered run per Args() line below.
+    ->ArgNames({"Repetition", "BatchSize"})  // Labels for range(0) and range(1).
+    ->Args({0, 100})  // Repetition 0, batch sizes 100 through 100000.
+    ->Args({0, 1000})
+    ->Args({0, 10000})
+    ->Args({0, 100000})
+    ->Args({1, 100})  // Repetition 1, same batch sizes.
+    ->Args({1, 1000})
+    ->Args({1, 10000})
+    ->Args({1, 100000})
+    ->Args({2, 100})  // Repetition 2, same batch sizes.
+    ->Args({2, 1000})
+    ->Args({2, 10000})
+    ->Args({2, 100000});
+
+BENCHMARK(ColumnReaderReadBatchInt32)  // One registered run per Args() line below.
+    ->ArgNames({"Repetition", "BatchSize"})  // Labels for range(0) and range(1).
+    ->Args({0, 100})  // Repetition 0, batch sizes 100 through 100000.
+    ->Args({0, 1000})
+    ->Args({0, 10000})
+    ->Args({0, 100000})
+    ->Args({1, 100})  // Repetition 1, same batch sizes.
+    ->Args({1, 1000})
+    ->Args({1, 10000})
+    ->Args({1, 100000})
+    ->Args({2, 100})  // Repetition 2, same batch sizes.
+    ->Args({2, 1000})
+    ->Args({2, 10000})
+    ->Args({2, 100000});
+
+}  // namespace benchmark
+}  // namespace parquet