You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/04/20 13:35:40 UTC
[arrow] branch master updated: ARROW-16014: [C++] Create more benchmarks for measuring expression evaluation overhead
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c6d844755a ARROW-16014: [C++] Create more benchmarks for measuring expression evaluation overhead
c6d844755a is described below
commit c6d844755ac253a3af8679ad527190a048881aba
Author: Sanjiban Sengupta <sa...@gmail.com>
AuthorDate: Wed Apr 20 15:35:30 2022 +0200
ARROW-16014: [C++] Create more benchmarks for measuring expression evaluation overhead
This PR adds an expression-execution benchmark in `arrow-compute` which takes `rows_per_batch` as a parameter and measures `rows_per_second` for executing the scalar expression.
```
Running ./release/arrow-compute-expression-benchmark
Run on (10 X 24.1211 MHz CPU s)
CPU Caches:
L1 Data 64 KiB (x10)
L1 Instruction 128 KiB (x10)
L2 Unified 4096 KiB (x5)
-----------------------------------------------------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
-----------------------------------------------------------------------------------------------------------------------------------------------------------
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:1000/real_time/threads:1 2996512 ns 2995935 ns 232 batches_per_second=333.721k/s rows_per_second=333.721M/s
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:1000/real_time/threads:10 2808554 ns 26825320 ns 250 batches_per_second=356.055k/s rows_per_second=356.055M/s
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:10000/real_time/threads:1 745300 ns 745197 ns 914 batches_per_second=134.174k/s rows_per_second=1.34174G/s
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:10000/real_time/threads:10 274030 ns 2621425 ns 2570 batches_per_second=364.923k/s rows_per_second=3.64923G/s
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:100000/real_time/threads:1 509337 ns 509270 ns 1371 batches_per_second=19.6334k/s rows_per_second=1.96334G/s
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:100000/real_time/threads:10 67678 ns 645726 ns 10530 batches_per_second=147.758k/s rows_per_second=14.7758G/s
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:1000000/real_time/threads:1 498521 ns 498480 ns 1397 batches_per_second=2.00593k/s rows_per_second=2.00593G/s
ExecuteScalarExpressionOverhead/complex_expression/rows_per_batch:1000000/real_time/threads:10 131368 ns 1231142 ns 5300 batches_per_second=7.61221k/s rows_per_second=7.61221G/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:1000/real_time/threads:1 1099162 ns 1098507 ns 635 batches_per_second=909.784k/s rows_per_second=909.784M/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:1000/real_time/threads:10 992847 ns 9585644 ns 700 batches_per_second=1007.2k/s rows_per_second=1007.2M/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:10000/real_time/threads:1 465254 ns 465121 ns 1447 batches_per_second=214.936k/s rows_per_second=2.14936G/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:10000/real_time/threads:10 100351 ns 891244 ns 7060 batches_per_second=996.503k/s rows_per_second=9.96503G/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:100000/real_time/threads:1 393506 ns 393395 ns 1784 batches_per_second=25.4126k/s rows_per_second=2.54126G/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:100000/real_time/threads:10 57732 ns 545456 ns 12070 batches_per_second=173.215k/s rows_per_second=17.3215G/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:1000000/real_time/threads:1 392542 ns 392370 ns 1797 batches_per_second=2.5475k/s rows_per_second=2.5475G/s
ExecuteScalarExpressionOverhead/simple_expression/rows_per_batch:1000000/real_time/threads:10 103905 ns 945614 ns 6740 batches_per_second=9.6242k/s rows_per_second=9.6242G/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:1000/real_time/threads:1 556220 ns 556173 ns 1265 batches_per_second=1.79785M/s rows_per_second=1.79785G/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:1000/real_time/threads:10 600231 ns 5753634 ns 1210 batches_per_second=1.66603M/s rows_per_second=1.66603G/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:10000/real_time/threads:1 55670 ns 55645 ns 12314 batches_per_second=1.79629M/s rows_per_second=17.9629G/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:10000/real_time/threads:10 59506 ns 563823 ns 12210 batches_per_second=1.68052M/s rows_per_second=16.8052G/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:100000/real_time/threads:1 5644 ns 5643 ns 123957 batches_per_second=1.77176M/s rows_per_second=177.176G/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:100000/real_time/threads:10 5932 ns 57219 ns 118590 batches_per_second=1.68579M/s rows_per_second=168.579G/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:1000000/real_time/threads:1 558 ns 557 ns 1259506 batches_per_second=1.79198M/s rows_per_second=1.79198T/s
ExecuteScalarExpressionOverhead/zero_copy_expression/rows_per_batch:1000000/real_time/threads:10 594 ns 5712 ns 1223120 batches_per_second=1.68243M/s rows_per_second=1.68243T/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:1000/real_time/threads:1 20897 ns 20877 ns 33585 batches_per_second=47.8538M/s rows_per_second=47.8538G/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:1000/real_time/threads:10 146133 ns 1390005 ns 5150 batches_per_second=6.84306M/s rows_per_second=6.84306G/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:10000/real_time/threads:1 2084 ns 2084 ns 335070 batches_per_second=47.9808M/s rows_per_second=479.808G/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:10000/real_time/threads:10 14656 ns 139612 ns 48420 batches_per_second=6.82311M/s rows_per_second=68.2311G/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:100000/real_time/threads:1 209 ns 209 ns 3342442 batches_per_second=47.755M/s rows_per_second=4.7755T/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:100000/real_time/threads:10 1404 ns 13089 ns 481520 batches_per_second=7.12084M/s rows_per_second=712.084G/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:1000000/real_time/threads:1 20.9 ns 20.9 ns 33575027 batches_per_second=47.8822M/s rows_per_second=47.8822T/s
ExecuteScalarExpressionOverhead/ref_only_expression/rows_per_batch:1000000/real_time/threads:10 145 ns 1371 ns 5094890 batches_per_second=6.89397M/s rows_per_second=6.89397T/s
```
Closes #12755 from sanjibansg/scheduler/ScalarExecution
Authored-by: Sanjiban Sengupta <sa...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/compute/exec/expression_benchmark.cc | 62 ++++++++++++++++++++++
1 file changed, 62 insertions(+)
diff --git a/cpp/src/arrow/compute/exec/expression_benchmark.cc b/cpp/src/arrow/compute/exec/expression_benchmark.cc
index d1738c9c23..508ce97294 100644
--- a/cpp/src/arrow/compute/exec/expression_benchmark.cc
+++ b/cpp/src/arrow/compute/exec/expression_benchmark.cc
@@ -21,6 +21,7 @@
#include "arrow/compute/exec/expression.h"
#include "arrow/compute/exec/test_util.h"
#include "arrow/dataset/partition.h"
+#include "arrow/testing/generator.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/type.h"
@@ -69,6 +70,30 @@ static void SimplifyFilterWithGuarantee(benchmark::State& state, Expression filt
}
}
+static void ExecuteScalarExpressionOverhead(benchmark::State& state, Expression expr) {
+ const auto rows_per_batch = static_cast<int32_t>(state.range(0));
+ const auto num_batches = 1000000 / rows_per_batch;
+
+ ExecContext ctx;
+ auto dataset_schema = schema({
+ field("x", int64()),
+ });
+ ExecBatch input({Datum(ConstantArrayGenerator::Int64(rows_per_batch, 5))},
+ /*length=*/rows_per_batch);
+
+ ASSIGN_OR_ABORT(auto bound, expr.Bind(*dataset_schema));
+ for (auto _ : state) {
+ for (int it = 0; it < num_batches; ++it)
+ ABORT_NOT_OK(ExecuteScalarExpression(bound, input, &ctx).status());
+ }
+ state.counters["rows_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches * rows_per_batch),
+ benchmark::Counter::kIsRate);
+
+ state.counters["batches_per_second"] = benchmark::Counter(
+ static_cast<double>(state.iterations() * num_batches), benchmark::Counter::kIsRate);
+}
+
auto to_int64 = compute::CastOptions::Safe(int64());
// A fully simplified filter.
auto filter_simple_negative = and_(equal(field_ref("a"), literal(int64_t(99))),
@@ -92,6 +117,13 @@ auto guarantee = and_(equal(field_ref("a"), literal(int64_t(99))),
auto guarantee_dictionary = and_(equal(field_ref("a"), literal(ninety_nine_dict)),
equal(field_ref("b"), literal(ninety_nine_dict)));
+auto complex_expression =
+ and_(less(field_ref("x"), literal(20)), greater(field_ref("x"), literal(0)));
+auto simple_expression = call("negate", {field_ref("x")});
+auto zero_copy_expression =
+ call("cast", {field_ref("x")}, compute::CastOptions::Safe(timestamp(TimeUnit::NANO)));
+auto ref_only_expression = field_ref("x");
+
// Negative queries (partition expressions that fail the filter)
BENCHMARK_CAPTURE(SimplifyFilterWithGuarantee, negative_filter_simple_guarantee_simple,
filter_simple_negative, guarantee);
@@ -120,5 +152,35 @@ BENCHMARK_CAPTURE(BindAndEvaluate, nested_array,
BENCHMARK_CAPTURE(BindAndEvaluate, nested_scalar,
field_ref(FieldRef("struct_scalar", "float")));
+BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, complex_expression, complex_expression)
+ ->ArgNames({"rows_per_batch"})
+ ->RangeMultiplier(10)
+ ->Range(1000, 1000000)
+ ->DenseThreadRange(1, std::thread::hardware_concurrency(),
+ std::thread::hardware_concurrency())
+ ->UseRealTime();
+BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, simple_expression, simple_expression)
+ ->ArgNames({"rows_per_batch"})
+ ->RangeMultiplier(10)
+ ->Range(1000, 1000000)
+ ->DenseThreadRange(1, std::thread::hardware_concurrency(),
+ std::thread::hardware_concurrency())
+ ->UseRealTime();
+BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, zero_copy_expression,
+ zero_copy_expression)
+ ->ArgNames({"rows_per_batch"})
+ ->RangeMultiplier(10)
+ ->Range(1000, 1000000)
+ ->DenseThreadRange(1, std::thread::hardware_concurrency(),
+ std::thread::hardware_concurrency())
+ ->UseRealTime();
+BENCHMARK_CAPTURE(ExecuteScalarExpressionOverhead, ref_only_expression,
+ ref_only_expression)
+ ->ArgNames({"rows_per_batch"})
+ ->RangeMultiplier(10)
+ ->Range(1000, 1000000)
+ ->DenseThreadRange(1, std::thread::hardware_concurrency(),
+ std::thread::hardware_concurrency())
+ ->UseRealTime();
} // namespace compute
} // namespace arrow