You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/11/02 10:29:44 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #14523: PARQUET-2206: [parquet-cpp] Microbenchmark for ColumnReader ReadBatch and Skip

pitrou commented on code in PR #14523:
URL: https://github.com/apache/arrow/pull/14523#discussion_r1011511343


##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Benchmarks Skip and ReadBatch API for ColumnReader with the following
+// paramenters in order:

Review Comment:
   ```suggestion
   // parameters in order:
   ```



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Benchmarks Skip and ReadBatch API for ColumnReader with the following
+// paramenters in order:
+// - def_level: set to 0 for REQUIRED, 1 for OPTIONAL/REPEATED.
+// - rep_level: set to 1 for REPEATED, 0 otherwise.
+// - is_skip: set to 0 for benchmarking ReadBatch and 1 for Skip.

Review Comment:
   This will make the results a bit cryptic. Can we instead define two sets of benchmarks? You can of course use a common helper function.



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Benchmarks Skip and ReadBatch API for ColumnReader with the following
+// paramenters in order:
+// - def_level: set to 0 for REQUIRED, 1 for OPTIONAL/REPEATED.
+// - rep_level: set to 1 for REPEATED, 0 otherwise.

Review Comment:
   Why not have a single `repetition` parameter instead?



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Benchmarks Skip and ReadBatch API for ColumnReader with the following
+// paramenters in order:
+// - def_level: set to 0 for REQUIRED, 1 for OPTIONAL/REPEATED.
+// - rep_level: set to 1 for REPEATED, 0 otherwise.
+// - is_skip: set to 0 for benchmarking ReadBatch and 1 for Skip.
+// - batch_size: sets how many values to read at each call.
+static void BM_Skip(::benchmark::State& state) {
+  internal::LevelInfo level_info;
+  level_info.def_level = state.range(0);
+  level_info.rep_level = state.range(1);
+  const int skip = state.range(2);
+  const int batch_size = state.range(3);
+
+  Repetition::type repetition = Repetition::REQUIRED;
+  if (level_info.def_level > 0) {
+    repetition = Repetition::OPTIONAL;
+  }
+  if (level_info.rep_level > 0) {
+    repetition = Repetition::REPEATED;
+  }
+  NodePtr type = schema::Int32("b", repetition);
+  const ColumnDescriptor descr(type, level_info.def_level, level_info.rep_level);
+
+  const int num_pages = 5;
+  const int levels_per_page = 100000;
+  // Vectors filled with random rep/defs and values to make pages.
+  std::vector<int32_t> values;
+  std::vector<int16_t> def_levels;
+  std::vector<int16_t> rep_levels;
+  std::vector<uint8_t> data_buffer;
+  std::vector<std::shared_ptr<Page>> pages;
+  MakePages<Int32Type>(&descr, num_pages, levels_per_page, def_levels, rep_levels, values,
+                       data_buffer, pages, Encoding::PLAIN);
+
+  // Vectors to read the values into.
+  std::vector<int32_t> read_values(batch_size, -1);
+  std::vector<int16_t> read_defs(batch_size, -1);
+  std::vector<int16_t> read_reps(batch_size, -1);
+
+  while (state.KeepRunning()) {

Review Comment:
   Nit: this can use a range-based `for` loop
   ```suggestion
     for (auto _ : state) {
   ```



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Benchmarks Skip and ReadBatch API for ColumnReader with the following
+// paramenters in order:
+// - def_level: set to 0 for REQUIRED, 1 for OPTIONAL/REPEATED.
+// - rep_level: set to 1 for REPEATED, 0 otherwise.
+// - is_skip: set to 0 for benchmarking ReadBatch and 1 for Skip.
+// - batch_size: sets how many values to read at each call.
+static void BM_Skip(::benchmark::State& state) {
+  internal::LevelInfo level_info;
+  level_info.def_level = state.range(0);
+  level_info.rep_level = state.range(1);
+  const int skip = state.range(2);
+  const int batch_size = state.range(3);
+
+  Repetition::type repetition = Repetition::REQUIRED;
+  if (level_info.def_level > 0) {
+    repetition = Repetition::OPTIONAL;
+  }
+  if (level_info.rep_level > 0) {
+    repetition = Repetition::REPEATED;
+  }
+  NodePtr type = schema::Int32("b", repetition);
+  const ColumnDescriptor descr(type, level_info.def_level, level_info.rep_level);
+
+  const int num_pages = 5;
+  const int levels_per_page = 100000;
+  // Vectors filled with random rep/defs and values to make pages.
+  std::vector<int32_t> values;
+  std::vector<int16_t> def_levels;
+  std::vector<int16_t> rep_levels;
+  std::vector<uint8_t> data_buffer;
+  std::vector<std::shared_ptr<Page>> pages;
+  MakePages<Int32Type>(&descr, num_pages, levels_per_page, def_levels, rep_levels, values,
+                       data_buffer, pages, Encoding::PLAIN);
+
+  // Vectors to read the values into.
+  std::vector<int32_t> read_values(batch_size, -1);
+  std::vector<int16_t> read_defs(batch_size, -1);
+  std::vector<int16_t> read_reps(batch_size, -1);
+
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::unique_ptr<PageReader> pager;
+    pager.reset(new test::MockPageReader(pages));
+    std::shared_ptr<ColumnReader> column_reader =
+        ColumnReader::Make(&descr, std::move(pager));
+    Int32Reader* reader = static_cast<Int32Reader*>(column_reader.get());
+    int values_count = -1;
+    state.ResumeTiming();
+    while (values_count != 0) {
+      if (skip == 1) {
+        DoNotOptimize(values_count = reader->Skip(batch_size));
+      } else {
+        int64_t values_read = 0;
+        DoNotOptimize(values_count = reader->ReadBatch(batch_size, read_defs.data(),
+                                                       read_reps.data(),
+                                                       read_values.data(), &values_read));
+      }
+    }
+  }
+}
+
+BENCHMARK(BM_Skip)
+    ->Iterations(1000)
+    ->Args({0, 0, 0, 1})

Review Comment:
   Is it useful to benchmark reading one value at a time?



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Benchmarks Skip and ReadBatch API for ColumnReader with the following
+// paramenters in order:
+// - def_level: set to 0 for REQUIRED, 1 for OPTIONAL/REPEATED.
+// - rep_level: set to 1 for REPEATED, 0 otherwise.
+// - is_skip: set to 0 for benchmarking ReadBatch and 1 for Skip.
+// - batch_size: sets how many values to read at each call.
+static void BM_Skip(::benchmark::State& state) {
+  internal::LevelInfo level_info;
+  level_info.def_level = state.range(0);
+  level_info.rep_level = state.range(1);
+  const int skip = state.range(2);
+  const int batch_size = state.range(3);
+
+  Repetition::type repetition = Repetition::REQUIRED;
+  if (level_info.def_level > 0) {
+    repetition = Repetition::OPTIONAL;
+  }
+  if (level_info.rep_level > 0) {
+    repetition = Repetition::REPEATED;
+  }
+  NodePtr type = schema::Int32("b", repetition);
+  const ColumnDescriptor descr(type, level_info.def_level, level_info.rep_level);
+
+  const int num_pages = 5;
+  const int levels_per_page = 100000;
+  // Vectors filled with random rep/defs and values to make pages.
+  std::vector<int32_t> values;
+  std::vector<int16_t> def_levels;
+  std::vector<int16_t> rep_levels;
+  std::vector<uint8_t> data_buffer;
+  std::vector<std::shared_ptr<Page>> pages;
+  MakePages<Int32Type>(&descr, num_pages, levels_per_page, def_levels, rep_levels, values,
+                       data_buffer, pages, Encoding::PLAIN);
+
+  // Vectors to read the values into.
+  std::vector<int32_t> read_values(batch_size, -1);
+  std::vector<int16_t> read_defs(batch_size, -1);
+  std::vector<int16_t> read_reps(batch_size, -1);
+
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::unique_ptr<PageReader> pager;
+    pager.reset(new test::MockPageReader(pages));
+    std::shared_ptr<ColumnReader> column_reader =
+        ColumnReader::Make(&descr, std::move(pager));
+    Int32Reader* reader = static_cast<Int32Reader*>(column_reader.get());
+    int values_count = -1;
+    state.ResumeTiming();
+    while (values_count != 0) {
+      if (skip == 1) {
+        DoNotOptimize(values_count = reader->Skip(batch_size));
+      } else {
+        int64_t values_read = 0;
+        DoNotOptimize(values_count = reader->ReadBatch(batch_size, read_defs.data(),
+                                                       read_reps.data(),
+                                                       read_values.data(), &values_read));
+      }
+    }
+  }
+}
+
+BENCHMARK(BM_Skip)
+    ->Iterations(1000)
+    ->Args({0, 0, 0, 1})

Review Comment:
   Also, it seems to make the benchmark very slow to run...



##########
cpp/src/parquet/column_reader_benchmark.cc:
##########
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/schema.h"
+#include "parquet/test_util.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+using benchmark::DoNotOptimize;
+using parquet::Repetition;
+using parquet::test::MakePages;
+using schema::NodePtr;
+
+namespace benchmark {
+
+// Benchmarks Skip and ReadBatch API for ColumnReader with the following
+// paramenters in order:
+// - def_level: set to 0 for REQUIRED, 1 for OPTIONAL/REPEATED.
+// - rep_level: set to 1 for REPEATED, 0 otherwise.
+// - is_skip: set to 0 for benchmarking ReadBatch and 1 for Skip.
+// - batch_size: sets how many values to read at each call.
+static void BM_Skip(::benchmark::State& state) {
+  internal::LevelInfo level_info;
+  level_info.def_level = state.range(0);
+  level_info.rep_level = state.range(1);
+  const int skip = state.range(2);
+  const int batch_size = state.range(3);
+
+  Repetition::type repetition = Repetition::REQUIRED;
+  if (level_info.def_level > 0) {
+    repetition = Repetition::OPTIONAL;
+  }
+  if (level_info.rep_level > 0) {
+    repetition = Repetition::REPEATED;
+  }
+  NodePtr type = schema::Int32("b", repetition);
+  const ColumnDescriptor descr(type, level_info.def_level, level_info.rep_level);
+
+  const int num_pages = 5;
+  const int levels_per_page = 100000;
+  // Vectors filled with random rep/defs and values to make pages.
+  std::vector<int32_t> values;
+  std::vector<int16_t> def_levels;
+  std::vector<int16_t> rep_levels;
+  std::vector<uint8_t> data_buffer;
+  std::vector<std::shared_ptr<Page>> pages;
+  MakePages<Int32Type>(&descr, num_pages, levels_per_page, def_levels, rep_levels, values,
+                       data_buffer, pages, Encoding::PLAIN);
+
+  // Vectors to read the values into.
+  std::vector<int32_t> read_values(batch_size, -1);
+  std::vector<int16_t> read_defs(batch_size, -1);
+  std::vector<int16_t> read_reps(batch_size, -1);
+
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::unique_ptr<PageReader> pager;
+    pager.reset(new test::MockPageReader(pages));
+    std::shared_ptr<ColumnReader> column_reader =
+        ColumnReader::Make(&descr, std::move(pager));
+    Int32Reader* reader = static_cast<Int32Reader*>(column_reader.get());
+    int values_count = -1;
+    state.ResumeTiming();
+    while (values_count != 0) {
+      if (skip == 1) {
+        DoNotOptimize(values_count = reader->Skip(batch_size));
+      } else {
+        int64_t values_read = 0;
+        DoNotOptimize(values_count = reader->ReadBatch(batch_size, read_defs.data(),
+                                                       read_reps.data(),
+                                                       read_values.data(), &values_read));
+      }
+    }
+  }
+}

Review Comment:
   One nice thing to add is custom counters to make benchmark arguments more intelligible.
   https://github.com/google/benchmark/blob/main/docs/user_guide.md#custom-counters
   
   Something like:
   ```c++
     state.counters["batch_size"] = batch_size;
     state.counters["repetition"] = static_cast<int>(repetition);
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org