You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2022/07/06 01:01:28 UTC
[arrow] branch master updated: ARROW-16599 [C++] Implementation of ExecuteScalarExpressionOverhead benchmarks without arrow for comparision (#13179)
This is an automated email from the ASF dual-hosted git repository.
westonpace pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c6c6373696 ARROW-16599 [C++] Implementation of ExecuteScalarExpressionOverhead benchmarks without arrow for comparision (#13179)
c6c6373696 is described below
commit c6c63736963226356d3d8c09c0437ee95ff6c59f
Author: zagto <to...@zagorni.eu>
AuthorDate: Wed Jul 6 03:01:19 2022 +0200
ARROW-16599 [C++] Implementation of ExecuteScalarExpressionOverhead benchmarks without arrow for comparision (#13179)
Lead-authored-by: Tobias Zagorni <to...@zagorni.eu>
Co-authored-by: zagto <to...@zagorni.eu>
Signed-off-by: Weston Pace <we...@gmail.com>
---
cpp/src/arrow/compute/exec/expression_benchmark.cc | 112 ++++++++++++++++++++-
1 file changed, 109 insertions(+), 3 deletions(-)
diff --git a/cpp/src/arrow/compute/exec/expression_benchmark.cc b/cpp/src/arrow/compute/exec/expression_benchmark.cc
index 508ce97294..70aa509d2e 100644
--- a/cpp/src/arrow/compute/exec/expression_benchmark.cc
+++ b/cpp/src/arrow/compute/exec/expression_benchmark.cc
@@ -78,13 +78,38 @@ static void ExecuteScalarExpressionOverhead(benchmark::State& state, Expression
auto dataset_schema = schema({
field("x", int64()),
});
- ExecBatch input({Datum(ConstantArrayGenerator::Int64(rows_per_batch, 5))},
- /*length=*/rows_per_batch);
+ std::vector<ExecBatch> inputs(num_batches);
+ for (auto& batch : inputs) {
+ batch = ExecBatch({Datum(ConstantArrayGenerator::Int64(rows_per_batch, 5))},
+ /*length=*/1);
+ }
ASSIGN_OR_ABORT(auto bound, expr.Bind(*dataset_schema));
for (auto _ : state) {
for (int it = 0; it < num_batches; ++it)
- ABORT_NOT_OK(ExecuteScalarExpression(bound, input, &ctx).status());
+ ABORT_NOT_OK(ExecuteScalarExpression(bound, inputs[it], &ctx).status());
+ }
+ state.counters["rows_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches * rows_per_batch),
+ benchmark::Counter::kIsRate);
+
+ state.counters["batches_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches), benchmark::Counter::kIsRate);
+}
+
+/// \brief Baseline benchmarks are implemented in pure C++ without arrow for performance
+/// comparison.
+template <typename BenchmarkType>
+void ExecuteScalarExpressionBaseline(benchmark::State& state) {
+ const auto rows_per_batch = static_cast<int32_t>(state.range(0));
+ const auto num_batches = 1000000 / rows_per_batch;
+
+ std::vector<std::vector<int64_t>> inputs(num_batches,
+ std::vector<int64_t>(rows_per_batch, 5));
+ BenchmarkType benchmark(rows_per_batch);
+
+ for (auto _ : state) {
+ for (int it = 0; it < num_batches; ++it) benchmark.Exec(inputs[it]);
}
state.counters["rows_per_second"] = benchmark::Counter(
static_cast<double>(state.iterations() * num_batches * rows_per_batch),
@@ -119,6 +144,9 @@ auto guarantee_dictionary = and_(equal(field_ref("a"), literal(ninety_nine_dict)
auto complex_expression =
and_(less(field_ref("x"), literal(20)), greater(field_ref("x"), literal(0)));
+// Integer-only arithmetic on column "x":
+// multiply(add(x, 20), add(x, -3)), i.e. (x + 20) * (x - 3).
+auto complex_integer_expression =
+ call("multiply", {call("add", {field_ref("x"), literal(20)}),
+ call("add", {field_ref("x"), literal(-3)})});
auto simple_expression = call("negate", {field_ref("x")});
auto zero_copy_expression =
call("cast", {field_ref("x")}, compute::CastOptions::Safe(timestamp(TimeUnit::NANO)));
@@ -152,6 +180,70 @@ BENCHMARK_CAPTURE(BindAndEvaluate, nested_array,
BENCHMARK_CAPTURE(BindAndEvaluate, nested_scalar,
field_ref(FieldRef("struct_scalar", "float")));
+/// \brief Baseline benchmark for complex_expression implemented without arrow
+///
+/// Evaluates `x > 0 AND x < 20` over a batch of int64 values and stores the result
+/// as a bitmap packed into 64-bit words: the value at position `w * 64 + b` of the
+/// batch maps to bit `b` (least significant bit first) of word `w`.
+///
+/// Hack: only complete groups of 64 values are processed, so up to 63 trailing
+/// values are ignored when the batch size is not a multiple of 64. Plain 64-bit
+/// words are used instead of std::vector<bool> because the latter slows things
+/// down massively.
+struct ComplexExpressionBaseline {
+ public:
+  /// \param input_size number of values per batch the buffers are pre-sized for
+  explicit ComplexExpressionBaseline(size_t input_size) { Resize(input_size / 64); }
+
+  /// \brief Compute the packed result bitmap for one batch.
+  ///
+  /// The buffers are re-sized only when the batch holds a different number of
+  /// complete 64-value groups than they were sized for, so the steady state does
+  /// not allocate and a longer batch can no longer write out of bounds.
+  void Exec(const std::vector<int64_t>& input) {
+    const size_t num_words = input.size() / 64;
+    if (output_.size() != num_words) Resize(num_words);
+
+    // Both comparisons are materialized into their own bitmap before being
+    // combined rather than fused into one pass.
+    // NOTE(review): presumably this mirrors evaluating the expression one call
+    // at a time - confirm before fusing the loops.
+    PackBits(input, &greater_0_, [](int64_t value) { return value > 0; });
+    PackBits(input, &less_20_, [](int64_t value) { return value < 20; });
+
+    for (size_t word = 0; word < num_words; word++) {
+      output_[word] = greater_0_[word] & less_20_[word];
+    }
+  }
+
+  /// \brief Packed result bitmap of the most recent Exec() call.
+  const std::vector<uint64_t>& output() const { return output_; }
+
+ private:
+  /// Size all three bitmaps to `num_words` 64-bit words.
+  void Resize(size_t num_words) {
+    greater_0_.resize(num_words);
+    less_20_.resize(num_words);
+    output_.resize(num_words);
+  }
+
+  /// Fill `words` with one bit per input value, set when `pred(value)` is true.
+  /// Requires `input.size() >= words->size() * 64`.
+  template <typename Predicate>
+  static void PackBits(const std::vector<int64_t>& input, std::vector<uint64_t>* words,
+                       Predicate pred) {
+    for (size_t word = 0; word < words->size(); word++) {
+      uint64_t packed = 0;
+      for (size_t bit = 0; bit < 64; bit++) {
+        // Place each result directly at its bit position. OR-ing into bit 0 and
+        // shifting afterwards would push the first value out of the word and
+        // leave bit 0 permanently clear.
+        packed |= static_cast<uint64_t>(pred(input[word * 64 + bit])) << bit;
+      }
+      (*words)[word] = packed;
+    }
+  }
+
+  std::vector<uint64_t> greater_0_;
+  std::vector<uint64_t> less_20_;
+  std::vector<uint64_t> output_;
+};
+
+/// \brief Baseline benchmark for simple_expression implemented without arrow
+///
+/// Negates every value of an int64 batch into a buffer that is reused between
+/// calls.
+struct SimpleExpressionBaseline {
+  /// \param input_size expected number of values per batch; pre-sizes `output`
+  explicit SimpleExpressionBaseline(size_t input_size) { output.resize(input_size); }
+
+  /// \brief Store the negation of each input value at the same index of `output`.
+  ///
+  /// `output` is re-sized only when the batch length differs from its current
+  /// size, so the steady state does not allocate and a longer batch can no longer
+  /// write out of bounds.
+  void Exec(const std::vector<int64_t>& input) {
+    const size_t input_size = input.size();
+    if (output.size() != input_size) output.resize(input_size);
+
+    for (size_t index = 0; index < input_size; index++) {
+      output[index] = -input[index];
+    }
+  }
+
+  /// Result of the most recent Exec() call.
+  std::vector<int64_t> output;
+};
+
+// Benchmarks the integer arithmetic expression. The captured argument must be
+// complex_integer_expression itself: capturing complex_expression here would run
+// the boolean expression a second time under the wrong label.
+BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, complex_integer_expression,
+                  complex_integer_expression)
+    ->ArgNames({"rows_per_batch"})
+    ->RangeMultiplier(10)
+    ->Range(1000, 1000000)
+    ->DenseThreadRange(1, std::thread::hardware_concurrency(),
+                       std::thread::hardware_concurrency())
+    ->UseRealTime();
+
BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, complex_expression, complex_expression)
->ArgNames({"rows_per_batch"})
->RangeMultiplier(10)
@@ -159,6 +251,13 @@ BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, complex_expression, complex_e
->DenseThreadRange(1, std::thread::hardware_concurrency(),
std::thread::hardware_concurrency())
->UseRealTime();
+// Registers the pure C++ baseline with ComplexExpressionBaseline as the
+// implementation under test. The rows_per_batch argument is swept from 1000 to
+// 1000000 in steps of x10, using the same builder chain as the
+// ExecuteScalarExpressionOverhead registrations in this file.
+BENCHMARK_TEMPLATE(ExecuteScalarExpressionBaseline, ComplexExpressionBaseline)
+ ->ArgNames({"rows_per_batch"})
+ ->RangeMultiplier(10)
+ ->Range(1000, 1000000)
+ ->DenseThreadRange(1, std::thread::hardware_concurrency(),
+ std::thread::hardware_concurrency())
+ ->UseRealTime();
BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, simple_expression, simple_expression)
->ArgNames({"rows_per_batch"})
->RangeMultiplier(10)
@@ -166,6 +265,13 @@ BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, simple_expression, simple_exp
->DenseThreadRange(1, std::thread::hardware_concurrency(),
std::thread::hardware_concurrency())
->UseRealTime();
+// Registers the pure C++ baseline with SimpleExpressionBaseline as the
+// implementation under test. The rows_per_batch argument is swept from 1000 to
+// 1000000 in steps of x10, using the same builder chain as the
+// ExecuteScalarExpressionOverhead registrations in this file.
+BENCHMARK_TEMPLATE(ExecuteScalarExpressionBaseline, SimpleExpressionBaseline)
+ ->ArgNames({"rows_per_batch"})
+ ->RangeMultiplier(10)
+ ->Range(1000, 1000000)
+ ->DenseThreadRange(1, std::thread::hardware_concurrency(),
+ std::thread::hardware_concurrency())
+ ->UseRealTime();
BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, zero_copy_expression,
zero_copy_expression)
->ArgNames({"rows_per_batch"})