You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/08/22 10:54:22 UTC
[arrow] branch master updated: ARROW-3099: [C++] Add benchmark for number parsing
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8602509 ARROW-3099: [C++] Add benchmark for number parsing
8602509 is described below
commit 8602509c6a1d2c0c396ecc23c5dd9d670b4190f4
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Aug 22 12:54:09 2018 +0200
ARROW-3099: [C++] Add benchmark for number parsing
Sample output here (gcc 7.3, Ubuntu 18.04):
```
---------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------
BM_IntegerParsing<Int8Type> 1393 ns 1393 ns 499140 5.47679M items/s
BM_IntegerParsing<Int16Type> 1496 ns 1496 ns 470116 5.09902M items/s
BM_IntegerParsing<Int32Type> 1660 ns 1660 ns 420510 4.59584M items/s
BM_IntegerParsing<Int64Type> 2071 ns 2070 ns 338406 3.68531M items/s
BM_IntegerParsing<UInt8Type> 1229 ns 1229 ns 568370 6.20627M items/s
BM_IntegerParsing<UInt16Type> 1311 ns 1311 ns 533548 5.81873M items/s
BM_IntegerParsing<UInt32Type> 1402 ns 1401 ns 498921 5.44415M items/s
BM_IntegerParsing<UInt64Type> 1625 ns 1625 ns 428792 4.69486M items/s
BM_FloatParsing<FloatType> 4501 ns 4501 ns 155621 1.69512M items/s
BM_FloatParsing<DoubleType> 4632 ns 4632 ns 150996 1.64721M items/s
```
Author: Antoine Pitrou <an...@python.org>
Closes #2456 from pitrou/ARROW-3099-benchmark-number-parsing and squashes the following commits:
08bbf5d7 <Antoine Pitrou> Try to fix integer conversion warning
60c4ac54 <Antoine Pitrou> Try to fix compile error
e5dd04db <Antoine Pitrou> ARROW-3099: Add benchmark for number parsing
---
cpp/src/arrow/util/CMakeLists.txt | 1 +
cpp/src/arrow/util/number-parsing-benchmark.cc | 113 +++++++++++++++++++++++++
cpp/src/arrow/util/parsing.h | 5 ++
3 files changed, 119 insertions(+)
diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 9a4fceb..d0c6f88 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -66,5 +66,6 @@ ADD_ARROW_TEST(lazy-test)
ADD_ARROW_BENCHMARK(bit-util-benchmark)
ADD_ARROW_BENCHMARK(decimal-benchmark)
ADD_ARROW_BENCHMARK(lazy-benchmark)
+ADD_ARROW_BENCHMARK(number-parsing-benchmark)  # source: number-parsing-benchmark.cc
add_subdirectory(variant)
diff --git a/cpp/src/arrow/util/number-parsing-benchmark.cc b/cpp/src/arrow/util/number-parsing-benchmark.cc
new file mode 100644
index 0000000..28ef76a
--- /dev/null
+++ b/cpp/src/arrow/util/number-parsing-benchmark.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <limits>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/test-util.h"
+#include "arrow/util/parsing.h"
+
+namespace arrow {
+namespace internal {
+
+// Build a list of `num_items` decimal integer strings to feed the parsing
+// benchmark.
+//
+// The strings cycle through a small set of base values: a few short literals
+// (negative ones only when `c_int` is signed) plus the minimum and maximum
+// values representable by `c_int`.
+template <typename c_int>
+static std::vector<std::string> MakeIntStrings(int32_t num_items) {
+  using c_int_limits = std::numeric_limits<c_int>;
+  const std::vector<std::string> base_strings = {
+      "0",
+      "5",
+      c_int_limits::is_signed ? "-12" : "12",
+      "34",
+      "99",
+      c_int_limits::is_signed ? "-111" : "111",
+      std::to_string(c_int_limits::min()),
+      std::to_string(c_int_limits::max())};
+  std::vector<std::string> strings;
+  for (int32_t i = 0; i < num_items; ++i) {
+    strings.push_back(base_strings[i % base_strings.size()]);
+  }
+  // Return the expanded list. Returning `base_strings` here would silently
+  // ignore `num_items` and always hand back only the 8 base values.
+  return strings;
+}
+
+// Build a list of `num_items` floating-point strings to feed the parsing
+// benchmark.
+//
+// The strings cycle through a fixed set of sample literals covering integer,
+// fractional and exponent notations, with both signs.
+static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
+  const std::vector<std::string> base_strings = {"0.0",         "5",        "-12.3",
+                                                 "98765430000", "3456.789", "0.0012345",
+                                                 "2.34567e8",   "-5.67e-8"};
+  std::vector<std::string> strings;
+  for (int32_t i = 0; i < num_items; ++i) {
+    strings.push_back(base_strings[i % base_strings.size()]);
+  }
+  // Return the expanded list. Returning `base_strings` here would silently
+  // ignore `num_items` and always hand back only the 8 base values.
+  return strings;
+}
+
+// Benchmark body: repeatedly parse the strings produced by
+// MakeIntStrings<C_TYPE>(1000) with StringConverter<ARROW_TYPE>.
+//
+// Aborts the process if any string fails to convert. The number of items
+// reported to the benchmark framework is iterations * number of input strings.
+template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
+static void BM_IntegerParsing(benchmark::State& state) {  // NOLINT non-const reference
+  auto inputs = MakeIntStrings<C_TYPE>(1000);
+  StringConverter<ARROW_TYPE> parse;
+
+  while (state.KeepRunning()) {
+    C_TYPE sum = 0;
+    for (const std::string& text : inputs) {
+      C_TYPE parsed;
+      const bool ok = parse(text.data(), text.length(), &parsed);
+      if (!ok) {
+        std::cerr << "Conversion failed for '" << text << "'";
+        std::abort();
+      }
+      // Explicit cast back to C_TYPE: for narrow integer types the addition
+      // is carried out in a promoted type.
+      sum = static_cast<C_TYPE>(sum + parsed);
+    }
+    // Hand the accumulated value to the framework so the parsing work is
+    // not optimized away.
+    benchmark::DoNotOptimize(sum);
+  }
+  state.SetItemsProcessed(state.iterations() * inputs.size());
+}
+
+// Benchmark body: repeatedly parse the strings produced by
+// MakeFloatStrings(1000) with StringConverter<ARROW_TYPE>.
+//
+// Aborts the process if any string fails to convert. The number of items
+// reported to the benchmark framework is iterations * number of input strings.
+template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
+static void BM_FloatParsing(benchmark::State& state) {  // NOLINT non-const reference
+  auto inputs = MakeFloatStrings(1000);
+  StringConverter<ARROW_TYPE> parse;
+
+  while (state.KeepRunning()) {
+    C_TYPE sum = 0;
+    for (const std::string& text : inputs) {
+      C_TYPE parsed;
+      const bool ok = parse(text.data(), text.length(), &parsed);
+      if (!ok) {
+        std::cerr << "Conversion failed for '" << text << "'";
+        std::abort();
+      }
+      sum += parsed;
+    }
+    // Hand the accumulated value to the framework so the parsing work is
+    // not optimized away.
+    benchmark::DoNotOptimize(sum);
+  }
+  state.SetItemsProcessed(state.iterations() * inputs.size());
+}
+// Register BM_IntegerParsing once per signed and unsigned Arrow integer type.
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int8Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int16Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int32Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int64Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt8Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt16Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt32Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt64Type);
+// Register BM_FloatParsing for the single- and double-precision Arrow types.
+BENCHMARK_TEMPLATE(BM_FloatParsing, FloatType);
+BENCHMARK_TEMPLATE(BM_FloatParsing, DoubleType);
+
+} // namespace internal
+} // namespace arrow
diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h
index efe3162..8efc614 100644
--- a/cpp/src/arrow/util/parsing.h
+++ b/cpp/src/arrow/util/parsing.h
@@ -76,6 +76,11 @@ class StringConverter<BooleanType> {
}
};
+// Ideas for faster float parsing:
+// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
+// - https://github.com/google/double-conversion
+// - https://github.com/achan001/dtoa-fast
+
template <class ARROW_TYPE>
class StringToFloatConverterMixin {
public: