You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/08/22 10:54:22 UTC
[arrow] branch master updated: ARROW-3099: [C++] Add benchmark for number parsing

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8602509  ARROW-3099: [C++] Add benchmark for number parsing
8602509 is described below

commit 8602509c6a1d2c0c396ecc23c5dd9d670b4190f4
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Aug 22 12:54:09 2018 +0200

    ARROW-3099: [C++] Add benchmark for number parsing
    
    Sample output here (gcc 7.3, Ubuntu 18.04):
    ```
    ---------------------------------------------------------------------
    Benchmark                              Time           CPU Iterations
    ---------------------------------------------------------------------
    BM_IntegerParsing<Int8Type>         1393 ns       1393 ns     499140   5.47679M items/s
    BM_IntegerParsing<Int16Type>        1496 ns       1496 ns     470116   5.09902M items/s
    BM_IntegerParsing<Int32Type>        1660 ns       1660 ns     420510   4.59584M items/s
    BM_IntegerParsing<Int64Type>        2071 ns       2070 ns     338406   3.68531M items/s
    BM_IntegerParsing<UInt8Type>        1229 ns       1229 ns     568370   6.20627M items/s
    BM_IntegerParsing<UInt16Type>       1311 ns       1311 ns     533548   5.81873M items/s
    BM_IntegerParsing<UInt32Type>       1402 ns       1401 ns     498921   5.44415M items/s
    BM_IntegerParsing<UInt64Type>       1625 ns       1625 ns     428792   4.69486M items/s
    BM_FloatParsing<FloatType>          4501 ns       4501 ns     155621   1.69512M items/s
    BM_FloatParsing<DoubleType>         4632 ns       4632 ns     150996   1.64721M items/s
    ```
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2456 from pitrou/ARROW-3099-benchmark-number-parsing and squashes the following commits:
    
    08bbf5d7 <Antoine Pitrou> Try to fix integer conversion warning
    60c4ac54 <Antoine Pitrou> Try to fix compile error
    e5dd04db <Antoine Pitrou> ARROW-3099:  Add benchmark for number parsing
---
 cpp/src/arrow/util/CMakeLists.txt              |   1 +
 cpp/src/arrow/util/number-parsing-benchmark.cc | 113 +++++++++++++++++++++++++
 cpp/src/arrow/util/parsing.h                   |   5 ++
 3 files changed, 119 insertions(+)

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 9a4fceb..d0c6f88 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -66,5 +66,6 @@ ADD_ARROW_TEST(lazy-test)
 ADD_ARROW_BENCHMARK(bit-util-benchmark)
 ADD_ARROW_BENCHMARK(decimal-benchmark)
 ADD_ARROW_BENCHMARK(lazy-benchmark)
+ADD_ARROW_BENCHMARK(number-parsing-benchmark)
 
 add_subdirectory(variant)
diff --git a/cpp/src/arrow/util/number-parsing-benchmark.cc b/cpp/src/arrow/util/number-parsing-benchmark.cc
new file mode 100644
index 0000000..28ef76a
--- /dev/null
+++ b/cpp/src/arrow/util/number-parsing-benchmark.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <limits>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/test-util.h"
+#include "arrow/util/parsing.h"
+
+namespace arrow {
+namespace internal {
+
+// Builds the input for the integer parsing benchmarks.
+//
+// Returns `num_items` decimal strings obtained by cycling through a small base
+// set: a few small values (negative ones when c_int is signed) followed by the
+// minimum and maximum of c_int.  A non-positive `num_items` yields an empty
+// vector.
+template <typename c_int>
+static std::vector<std::string> MakeIntStrings(int32_t num_items) {
+  using c_int_limits = std::numeric_limits<c_int>;
+  const std::vector<std::string> base_strings = {"0",
+                                                 "5",
+                                                 c_int_limits::is_signed ? "-12" : "12",
+                                                 "34",
+                                                 "99",
+                                                 c_int_limits::is_signed ? "-111" : "111",
+                                                 std::to_string(c_int_limits::min()),
+                                                 std::to_string(c_int_limits::max())};
+  std::vector<std::string> strings;
+  for (int32_t i = 0; i < num_items; ++i) {
+    strings.push_back(base_strings[i % base_strings.size()]);
+  }
+  // Return the cycled list.  Returning `base_strings` here would silently cap
+  // the benchmark input at the 8 base entries whatever `num_items` is.
+  return strings;
+}
+
+// Builds the input for the floating-point parsing benchmarks.
+//
+// Returns `num_items` strings obtained by cycling through a fixed set of
+// literals (plain integers, decimals and exponent notation).  A non-positive
+// `num_items` yields an empty vector.
+static std::vector<std::string> MakeFloatStrings(int32_t num_items) {
+  const std::vector<std::string> base_strings = {"0.0",         "5",        "-12.3",
+                                                 "98765430000", "3456.789", "0.0012345",
+                                                 "2.34567e8",   "-5.67e-8"};
+  std::vector<std::string> strings;
+  for (int32_t i = 0; i < num_items; ++i) {
+    strings.push_back(base_strings[i % base_strings.size()]);
+  }
+  // Return the cycled list.  Returning `base_strings` here would silently cap
+  // the benchmark input at the 8 base entries whatever `num_items` is.
+  return strings;
+}
+
+// Benchmarks StringConverter<ARROW_TYPE> on the strings produced by
+// MakeIntStrings<C_TYPE>(1000).
+//
+// Each benchmark iteration converts every input string once and accumulates the
+// parsed values.  The process is aborted if any string fails to convert.
+template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
+static void BM_IntegerParsing(benchmark::State& state) {  // NOLINT non-const reference
+  const std::vector<std::string> inputs = MakeIntStrings<C_TYPE>(1000);
+  StringConverter<ARROW_TYPE> parse;
+
+  while (state.KeepRunning()) {
+    C_TYPE sum = 0;
+    for (const std::string& text : inputs) {
+      C_TYPE parsed;
+      if (parse(text.data(), text.length(), &parsed)) {
+        // Explicit cast back to C_TYPE: for the narrow integer types the
+        // addition is carried out in a wider type.
+        sum = static_cast<C_TYPE>(sum + parsed);
+      } else {
+        std::cerr << "Conversion failed for '" << text << "'";
+        std::abort();
+      }
+    }
+    // Make the accumulated result observable so the loop is not optimized away.
+    benchmark::DoNotOptimize(sum);
+  }
+  state.SetItemsProcessed(state.iterations() * inputs.size());
+}
+
+// Benchmarks StringConverter<ARROW_TYPE> on the strings produced by
+// MakeFloatStrings(1000).
+//
+// Each benchmark iteration converts every input string once and accumulates the
+// parsed values.  The process is aborted if any string fails to convert.
+template <typename ARROW_TYPE, typename C_TYPE = typename ARROW_TYPE::c_type>
+static void BM_FloatParsing(benchmark::State& state) {  // NOLINT non-const reference
+  const std::vector<std::string> inputs = MakeFloatStrings(1000);
+  StringConverter<ARROW_TYPE> parse;
+
+  while (state.KeepRunning()) {
+    C_TYPE sum = 0;
+    for (const std::string& text : inputs) {
+      C_TYPE parsed;
+      if (parse(text.data(), text.length(), &parsed)) {
+        sum += parsed;
+      } else {
+        std::cerr << "Conversion failed for '" << text << "'";
+        std::abort();
+      }
+    }
+    // Make the accumulated result observable so the loop is not optimized away.
+    benchmark::DoNotOptimize(sum);
+  }
+  state.SetItemsProcessed(state.iterations() * inputs.size());
+}
+
+// Register the integer parsing benchmark once for each signed and unsigned
+// Arrow integer type listed below.
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int8Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int16Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int32Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, Int64Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt8Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt16Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt32Type);
+BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt64Type);
+
+// Register the float parsing benchmark for FloatType and DoubleType.
+BENCHMARK_TEMPLATE(BM_FloatParsing, FloatType);
+BENCHMARK_TEMPLATE(BM_FloatParsing, DoubleType);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h
index efe3162..8efc614 100644
--- a/cpp/src/arrow/util/parsing.h
+++ b/cpp/src/arrow/util/parsing.h
@@ -76,6 +76,11 @@ class StringConverter<BooleanType> {
   }
 };
 
+// Ideas for faster float parsing:
+// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
+// - https://github.com/google/double-conversion
+// - https://github.com/achan001/dtoa-fast
+
 template <class ARROW_TYPE>
 class StringToFloatConverterMixin {
  public: