You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/04/11 15:54:10 UTC
[arrow] branch master updated: ARROW-2224: [C++] Remove boost-regex
dependency
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 4009b62 ARROW-2224: [C++] Remove boost-regex dependency
4009b62 is described below
commit 4009b62086dfa43a4fd8bfa714772716e6531c6f
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Apr 11 17:53:51 2018 +0200
ARROW-2224: [C++] Remove boost-regex dependency
Use a hand-written decimal parser.
Author: Antoine Pitrou <an...@python.org>
Closes #1880 from pitrou/ARROW-2224-remove-boost-regex-dep and squashes the following commits:
c173ceb <Antoine Pitrou> Mark result unused
0277855 <Antoine Pitrou> Add C++ decimal benchmark
f19dac2 <Antoine Pitrou> Address review comments
ba0b62e <Antoine Pitrou> ARROW-2224: Remove boost-regex dependency
---
cpp/README.md | 1 -
cpp/cmake_modules/ThirdpartyToolchain.cmake | 24 +---
cpp/src/arrow/util/CMakeLists.txt | 1 +
cpp/src/arrow/util/decimal-benchmark.cc | 45 ++++++++
cpp/src/arrow/util/decimal-test.cc | 2 -
cpp/src/arrow/util/decimal.cc | 164 +++++++++++++++-------------
6 files changed, 139 insertions(+), 98 deletions(-)
diff --git a/cpp/README.md b/cpp/README.md
index 8018efd..daeeade 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -35,7 +35,6 @@ On Ubuntu/Debian you can install the requirements with:
```shell
sudo apt-get install cmake \
libboost-dev \
- libboost-regex-dev \
libboost-filesystem-dev \
libboost-system-dev
```
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 129174c..020e0ed 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -157,11 +157,8 @@ if (ARROW_BOOST_VENDORED)
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}")
set(BOOST_STATIC_FILESYSTEM_LIBRARY
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}")
- set(BOOST_STATIC_REGEX_LIBRARY
- "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_regex${CMAKE_STATIC_LIBRARY_SUFFIX}")
set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}")
set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}")
- set(BOOST_REGEX_LIBRARY "${BOOST_STATIC_REGEX_LIBRARY}")
if (ARROW_BOOST_HEADER_ONLY)
set(BOOST_BUILD_PRODUCTS)
set(BOOST_CONFIGURE_COMMAND "")
@@ -169,12 +166,11 @@ if (ARROW_BOOST_VENDORED)
else()
set(BOOST_BUILD_PRODUCTS
${BOOST_SYSTEM_LIBRARY}
- ${BOOST_FILESYSTEM_LIBRARY}
- ${BOOST_REGEX_LIBRARY})
+ ${BOOST_FILESYSTEM_LIBRARY})
set(BOOST_CONFIGURE_COMMAND
"./bootstrap.sh"
"--prefix=${BOOST_PREFIX}"
- "--with-libraries=filesystem,system,regex")
+ "--with-libraries=filesystem,system")
if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
set(BOOST_BUILD_VARIANT "debug")
else()
@@ -214,19 +210,16 @@ else()
if (ARROW_BOOST_HEADER_ONLY)
find_package(Boost REQUIRED)
else()
- find_package(Boost COMPONENTS system filesystem regex REQUIRED)
+ find_package(Boost COMPONENTS system filesystem REQUIRED)
if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
- set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
else()
set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE})
- set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
endif()
set(BOOST_SYSTEM_LIBRARY boost_system_shared)
set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared)
- set(BOOST_REGEX_LIBRARY boost_regex_shared)
endif()
else()
# Find static boost headers and libs
@@ -235,19 +228,16 @@ else()
if (ARROW_BOOST_HEADER_ONLY)
find_package(Boost REQUIRED)
else()
- find_package(Boost COMPONENTS system filesystem regex REQUIRED)
+ find_package(Boost COMPONENTS system filesystem REQUIRED)
if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
- set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
else()
set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE})
- set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
endif()
set(BOOST_SYSTEM_LIBRARY boost_system_static)
set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static)
- set(BOOST_REGEX_LIBRARY boost_regex_static)
endif()
endif()
endif()
@@ -264,11 +254,7 @@ if (NOT ARROW_BOOST_HEADER_ONLY)
STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}"
SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}")
- ADD_THIRDPARTY_LIB(boost_regex
- STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}"
- SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}")
-
- SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex)
+ SET(ARROW_BOOST_LIBS boost_system boost_filesystem)
endif()
include_directories(SYSTEM ${Boost_INCLUDE_DIR})
diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 41c27a5..8969527 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -60,5 +60,6 @@ ADD_ARROW_TEST(rle-encoding-test)
ADD_ARROW_TEST(stl-util-test)
ADD_ARROW_BENCHMARK(bit-util-benchmark)
+ADD_ARROW_BENCHMARK(decimal-benchmark)
add_subdirectory(variant)
diff --git a/cpp/src/arrow/util/decimal-benchmark.cc b/cpp/src/arrow/util/decimal-benchmark.cc
new file mode 100644
index 0000000..3129536
--- /dev/null
+++ b/cpp/src/arrow/util/decimal-benchmark.cc
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/util/decimal.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace Decimal {
+
+static void BM_FromString(benchmark::State& state) { // NOLINT non-const reference
+ std::vector<std::string> values = {"0", "1.23", "12.345e6", "-12.345e-6"};
+
+ while (state.KeepRunning()) {
+ for (const auto& value : values) {
+ Decimal128 dec;
+ int32_t scale, precision;
+ ARROW_UNUSED(Decimal128::FromString(value, &dec, &scale, &precision));
+ }
+ }
+ state.SetItemsProcessed(state.iterations() * values.size());
+}
+
+BENCHMARK(BM_FromString)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+
+} // namespace Decimal
+} // namespace arrow
diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc
index 6db46d4..2829e4a 100644
--- a/cpp/src/arrow/util/decimal-test.cc
+++ b/cpp/src/arrow/util/decimal-test.cc
@@ -205,9 +205,7 @@ TEST(DecimalZerosTest, LeadingZerosDecimalPoint) {
int32_t precision;
int32_t scale;
ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale));
- // We explicitly do not support this for now, otherwise this would be ASSERT_EQ
ASSERT_EQ(4, precision);
-
ASSERT_EQ(4, scale);
ASSERT_EQ(0, d);
}
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
index 48380a9..9e5e3dd 100644
--- a/cpp/src/arrow/util/decimal.cc
+++ b/cpp/src/arrow/util/decimal.cc
@@ -23,8 +23,6 @@
#include <limits>
#include <sstream>
-#include <boost/regex.hpp>
-
#include "arrow/util/bit-util.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
@@ -253,112 +251,126 @@ static void StringToInteger(const std::string& str, Decimal128* out) {
}
}
-static const boost::regex DECIMAL_REGEX(
- // sign of the number
- "(?<SIGN>[-+]?)"
-
- // digits around the decimal point
- "(((?<LEFT_DIGITS>\\d+)\\.(?<FIRST_RIGHT_DIGITS>\\d*)|\\.(?<SECOND_RIGHT_DIGITS>\\d+)"
- ")"
+namespace {
- // optional exponent
- "([eE](?<FIRST_EXP_VALUE>[-+]?\\d+))?"
+struct DecimalComponents {
+ std::string sign;
+ std::string whole_digits;
+ std::string fractional_digits;
+ std::string exponent_sign;
+ std::string exponent_digits;
+};
- // otherwise
- "|"
+inline bool IsSign(char c) { return c == '-' || c == '+'; }
- // we're just an integer
- "(?<INTEGER>\\d+)"
+inline bool IsDot(char c) { return c == '.'; }
- // or an integer with an exponent
- "(?:[eE](?<SECOND_EXP_VALUE>[-+]?\\d+))?)");
+inline bool IsDigit(char c) { return c >= '0' && c <= '9'; }
-static inline bool is_zero_character(char c) { return c == '0'; }
+inline bool StartsExponent(char c) { return c == 'e' || c == 'E'; }
-Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision,
- int32_t* scale) {
- if (s.empty()) {
- return Status::Invalid("Empty string cannot be converted to decimal");
+inline size_t ParseDigitsRun(const char* s, size_t start, size_t size, std::string* out) {
+ size_t pos;
+ for (pos = start; pos < size; ++pos) {
+ if (!IsDigit(s[pos])) {
+ break;
+ }
}
+ *out = std::string(s + start, pos - start);
+ return pos;
+}
- // case of all zeros
- if (std::all_of(s.cbegin(), s.cend(), is_zero_character)) {
- if (precision != nullptr) {
- *precision = 0;
- }
+bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out) {
+ size_t pos = 0;
- if (scale != nullptr) {
- *scale = 0;
+ if (size == 0) {
+ return false;
+ }
+ // Sign of the number
+ if (IsSign(s[pos])) {
+ out->sign = std::string(s + pos, 1);
+ ++pos;
+ }
+ // First run of digits
+ pos = ParseDigitsRun(s, pos, size, &out->whole_digits);
+ if (pos == size) {
+ return !out->whole_digits.empty();
+ }
+ // Optional dot (if given in fractional form)
+ bool has_dot = IsDot(s[pos]);
+ if (has_dot) {
+ // Second run of digits
+ ++pos;
+ pos = ParseDigitsRun(s, pos, size, &out->fractional_digits);
+ }
+ if (out->whole_digits.empty() && out->fractional_digits.empty()) {
+ // Need at least some digits (whole or fractional)
+ return false;
+ }
+ if (pos == size) {
+ return true;
+ }
+ // Optional exponent
+ if (StartsExponent(s[pos])) {
+ ++pos;
+ if (pos == size) {
+ return false;
+ }
+ // Optional exponent sign
+ if (IsSign(s[pos])) {
+ out->exponent_sign = std::string(s + pos, 1);
+ ++pos;
+ }
+ pos = ParseDigitsRun(s, pos, size, &out->exponent_digits);
+ if (out->exponent_digits.empty()) {
+ // Need some exponent digits
+ return false;
}
-
- *out = 0;
- return Status::OK();
}
+ return pos == size;
+}
- boost::smatch results;
- const bool matches = boost::regex_match(s, results, DECIMAL_REGEX);
+} // namespace
- if (!matches) {
- std::stringstream ss;
- ss << "The string " << s << " is not a valid decimal number";
- return Status::Invalid(ss.str());
+Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision,
+ int32_t* scale) {
+ if (s.empty()) {
+ return Status::Invalid("Empty string cannot be converted to decimal");
}
- const std::string sign = results["SIGN"];
- const std::string integer = results["INTEGER"];
-
- const std::string left_digits = results["LEFT_DIGITS"];
- const std::string first_right_digits = results["FIRST_RIGHT_DIGITS"];
-
- const std::string second_right_digits = results["SECOND_RIGHT_DIGITS"];
-
- const std::string first_exp_value = results["FIRST_EXP_VALUE"];
- const std::string second_exp_value = results["SECOND_EXP_VALUE"];
-
- std::string whole_part;
- std::string fractional_part;
- std::string exponent_value;
-
- if (!integer.empty()) {
- whole_part = integer;
- } else if (!left_digits.empty()) {
- DCHECK(second_right_digits.empty()) << s << " " << second_right_digits;
- whole_part = left_digits;
- fractional_part = first_right_digits;
- } else {
- DCHECK(first_right_digits.empty()) << s << " " << first_right_digits;
- fractional_part = second_right_digits;
+ DecimalComponents dec;
+ if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
+ std::stringstream ss;
+ ss << "The string '" << s << "' is not a valid decimal number";
+ return Status::Invalid(ss.str());
}
+ std::string exponent_value = dec.exponent_sign + dec.exponent_digits;
- // skip leading zeros before the decimal point
- std::string::const_iterator without_leading_zeros =
- std::find_if_not(whole_part.cbegin(), whole_part.cend(), is_zero_character);
- whole_part = std::string(without_leading_zeros, whole_part.cend());
-
- if (!first_exp_value.empty()) {
- exponent_value = first_exp_value;
- } else {
- exponent_value = second_exp_value;
+ // Count number of significant digits (without leading zeros)
+ size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
+ size_t significant_digits = dec.fractional_digits.size();
+ if (first_non_zero != std::string::npos) {
+ significant_digits += dec.whole_digits.size() - first_non_zero;
}
if (precision != nullptr) {
- *precision = static_cast<int32_t>(whole_part.size() + fractional_part.size());
+ *precision = static_cast<int32_t>(significant_digits);
}
if (scale != nullptr) {
if (!exponent_value.empty()) {
auto adjusted_exponent = static_cast<int32_t>(std::stol(exponent_value));
- auto len = static_cast<int32_t>(whole_part.size() + fractional_part.size());
+ auto len = static_cast<int32_t>(significant_digits);
*scale = -adjusted_exponent + len - 1;
} else {
- *scale = static_cast<int32_t>(fractional_part.size());
+ *scale = static_cast<int32_t>(dec.fractional_digits.size());
}
}
if (out != nullptr) {
*out = 0;
- StringToInteger(whole_part + fractional_part, out);
- if (sign == "-") {
+ StringToInteger(dec.whole_digits + dec.fractional_digits, out);
+ if (dec.sign == "-") {
out->Negate();
}
--
To stop receiving notification emails like this one, please contact
apitrou@apache.org.