You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by zh...@apache.org on 2019/11/19 10:00:24 UTC
[incubator-doris] branch master updated: Improve to_bitmap parse
int performance (#2223)
This is an automated email from the ASF dual-hosted git repository.
zhaoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 14769b0 Improve to_bitmap parse int performance (#2223)
14769b0 is described below
commit 14769b0beb7671626da011611d2c73ffe841b831
Author: kangkaisen <ka...@apache.org>
AuthorDate: Tue Nov 19 18:00:19 2019 +0800
Improve to_bitmap parse int performance (#2223)
---
be/src/exprs/bitmap_function.cpp | 24 ++++-------
be/src/util/string_parser.hpp | 68 +++++++++++++++++++++++++++++++
be/test/exprs/bitmap_function_test.cpp | 4 +-
be/test/util/CMakeLists.txt | 1 +
be/test/util/string_parser_test.cpp | 73 +++++++++++++++++++++++++++++++---
run-ut.sh | 1 +
6 files changed, 146 insertions(+), 25 deletions(-)
diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp
index 107362a..2f1942a 100644
--- a/be/src/exprs/bitmap_function.cpp
+++ b/be/src/exprs/bitmap_function.cpp
@@ -19,6 +19,7 @@
#include "exprs/anyval_util.h"
#include "util/bitmap.h"
+#include "util/string_parser.hpp"
namespace doris {
void BitmapFunctions::init() {
@@ -79,27 +80,16 @@ BigIntVal BitmapFunctions::bitmap_count(FunctionContext* ctx, const StringVal& s
StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) {
std::unique_ptr<RoaringBitmap> bitmap {new RoaringBitmap()};
if (!src.is_null) {
- std::string tmp_str = std::string(reinterpret_cast<char*>(src.ptr), src.len) ;
- unsigned long uint32_value = 0;
- try {
- uint32_value = std::stoul(tmp_str);
- // the std::stoul result type is unsigned long, not uint32_t. so we need check it
- if(UNLIKELY(uint32_value > std::numeric_limits<unsigned int>::max())) {
- throw std::out_of_range("");
- }
- } catch (std::invalid_argument& e) {
+ StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
+ uint32_t int_value = StringParser::string_to_unsigned_int<uint32_t>(reinterpret_cast<char*>(src.ptr), src.len, &parse_result);
+ if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) {
std::stringstream error_msg;
- error_msg << "The to_bitmap function argument: " << tmp_str << " type isn't integer family";
- ctx->set_error(error_msg.str().c_str());
- return StringVal::null();
- } catch (std::out_of_range& e) {
- std::stringstream error_msg;
- error_msg << "The to_bitmap function argument: " << tmp_str << " exceed unsigned integer max value "
- << std::numeric_limits<unsigned int>::max();
+ error_msg << "The to_bitmap function argument: " << std::string(reinterpret_cast<char*>(src.ptr), src.len)
+ << " type isn't integer family or exceed unsigned integer max value 4294967295";
ctx->set_error(error_msg.str().c_str());
return StringVal::null();
}
- bitmap->update(uint32_value);
+ bitmap->update(int_value);
}
std::string buf;
buf.resize(bitmap->size());
diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp
index 3a0aa73..0354343 100644
--- a/be/src/util/string_parser.hpp
+++ b/be/src/util/string_parser.hpp
@@ -86,6 +86,20 @@ public:
return string_to_int_internal<T>(s + i, len - i, result);
}
+ // This is considerably faster than glibc's implementation.
+ // In the case of overflow, the max/min value for the data type will be returned.
+ // Assumes s represents a decimal number.
+ template <typename T>
+ static inline T string_to_unsigned_int(const char* s, int len, ParseResult* result) {
+ T ans = string_to_unsigned_int_internal<T>(s, len, result);
+ if (LIKELY(*result == PARSE_SUCCESS)){
+ return ans;
+ }
+
+ int i = skip_leading_whitespace(s, len);
+ return string_to_unsigned_int_internal<T>(s + i, len - i, result);
+ }
+
// Convert a string s representing a number in given base into a decimal number.
template <typename T>
static inline T string_to_int(const char* s, int len, int base, ParseResult* result) {
@@ -159,6 +173,13 @@ private:
template <typename T>
static inline T string_to_int_internal(const char* s, int len, ParseResult* result);
+ // This is considerably faster than glibc's implementation.
+ // In the case of overflow, the max/min value for the data type will be returned.
+ // Assumes s represents a decimal number.
+ // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed.
+ template <typename T>
+ static inline T string_to_unsigned_int_internal(const char* s, int len, ParseResult* result);
+
// Convert a string s representing a number in given base into a decimal number.
// Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed.
template <typename T>
@@ -267,6 +288,53 @@ inline T StringParser::string_to_int_internal(const char* s, int len, ParseResul
}
template <typename T>
+inline T StringParser::string_to_unsigned_int_internal(const char* s, int len, ParseResult* result) {
+ if (UNLIKELY(len <= 0)) {
+ *result = PARSE_FAILURE;
+ return 0;
+ }
+
+ T val = 0;
+ T max_val = std::numeric_limits<T>::max();
+ int i = 0;
+
+ typedef typename std::make_signed<T>::type signedT;
+ // This is the fast path where the string cannot overflow.
+ if (LIKELY(len - i < StringParseTraits<signedT>::max_ascii_len())) {
+ val = string_to_int_no_overflow<T>(s + i, len - i, result);
+ return val;
+ }
+
+ const T max_div_10 = max_val / 10;
+ const T max_mod_10 = max_val % 10;
+
+ int first = i;
+ for (; i < len; ++i) {
+ if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
+ T digit = s[i] - '0';
+ // This is a tricky check to see if adding this digit will cause an overflow.
+ if (UNLIKELY(val > (max_div_10 - (digit > max_mod_10)))) {
+ *result = PARSE_OVERFLOW;
+ return max_val;
+ }
+ val = val * 10 + digit;
+ } else {
+ if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) {
+ // Reject the string because either the first char was not a digit,
+ // or the remaining chars are not all whitespace
+ *result = PARSE_FAILURE;
+ return 0;
+ }
+ // Returning here is slightly faster than breaking the loop.
+ *result = PARSE_SUCCESS;
+ return val;
+ }
+ }
+ *result = PARSE_SUCCESS;
+ return val;
+}
+
+template <typename T>
inline T StringParser::string_to_int_internal(
const char* s, int len, int base, ParseResult* result) {
typedef typename std::make_unsigned<T>::type UnsignedT;
diff --git a/be/test/exprs/bitmap_function_test.cpp b/be/test/exprs/bitmap_function_test.cpp
index 228a104..b8492f2 100644
--- a/be/test/exprs/bitmap_function_test.cpp
+++ b/be/test/exprs/bitmap_function_test.cpp
@@ -90,7 +90,7 @@ TEST_F(BitmapFunctionsTest, to_bitmap_invalid_argument) {
ASSERT_EQ(expected, result);
ASSERT_TRUE(ctx->has_error());
- std::string error_msg("The to_bitmap function argument: xxxxxx type isn't integer family");
+ std::string error_msg("The to_bitmap function argument: xxxxxx type isn't integer family or exceed unsigned integer max value 4294967295");
ASSERT_EQ(error_msg, ctx->error_msg());
}
@@ -103,7 +103,7 @@ TEST_F(BitmapFunctionsTest, to_bitmap_out_of_range) {
ASSERT_TRUE(ctx->has_error());
- std::string error_msg("The to_bitmap function argument: 4294967296 exceed unsigned integer max value 4294967295");
+ std::string error_msg("The to_bitmap function argument: 4294967296 type isn't integer family or exceed unsigned integer max value 4294967295");
ASSERT_EQ(error_msg, ctx->error_msg());
}
diff --git a/be/test/util/CMakeLists.txt b/be/test/util/CMakeLists.txt
index bc78c18..1b4739e 100644
--- a/be/test/util/CMakeLists.txt
+++ b/be/test/util/CMakeLists.txt
@@ -32,6 +32,7 @@ ADD_BE_TEST(new_metrics_test)
ADD_BE_TEST(doris_metrics_test)
ADD_BE_TEST(system_metrics_test)
ADD_BE_TEST(string_util_test)
+ADD_BE_TEST(string_parser_test)
ADD_BE_TEST(core_local_test)
ADD_BE_TEST(types_test)
ADD_BE_TEST(json_util_test)
diff --git a/be/test/util/string_parser_test.cpp b/be/test/util/string_parser_test.cpp
index 0fd308b..dcfb3a6 100644
--- a/be/test/util/string_parser_test.cpp
+++ b/be/test/util/string_parser_test.cpp
@@ -47,6 +47,21 @@ void test_int_value(const char* s, T exp_val, StringParser::ParseResult exp_resu
}
}
+// Tests conversion of s to integer with and without leading/trailing whitespace
+template<typename T>
+void test_unsigned_int_value(const char* s, T exp_val, StringParser::ParseResult exp_result) {
+ for (int i = 0; i < space_len; ++i) {
+ for (int j = 0; j < space_len; ++j) {
+ // All combinations of leading and/or trailing whitespace.
+ std::string str = space[i] + s + space[j];
+ StringParser::ParseResult result;
+ T val = StringParser::string_to_unsigned_int<T>(str.data(), str.length(), &result);
+ EXPECT_EQ(exp_val, val) << str;
+ EXPECT_EQ(result, exp_result);
+ }
+ }
+}
+
// Tests conversion of s, given a base, to an integer with and without leading/trailing whitespace
template<typename T>
void test_int_value(const char* s, int base, T exp_val, StringParser::ParseResult exp_result) {
@@ -209,13 +224,59 @@ TEST(StringToInt, Limit) {
test_int_value<int32_t>("2147483647", 2147483647, StringParser::PARSE_SUCCESS);
test_int_value<int32_t>("-2147483648", -2147483648, StringParser::PARSE_SUCCESS);
test_int_value<int64_t>(
- "9223372036854775807",
- std::numeric_limits<int64_t>::max(),
- StringParser::PARSE_SUCCESS);
+ "9223372036854775807",
+ std::numeric_limits<int64_t>::max(),
+ StringParser::PARSE_SUCCESS);
test_int_value<int64_t>(
- "-9223372036854775808",
- std::numeric_limits<int64_t>::min(),
- StringParser::PARSE_SUCCESS);
+ "-9223372036854775808",
+ std::numeric_limits<int64_t>::min(),
+ StringParser::PARSE_SUCCESS);
+}
+
+TEST(StringToUnsignedInt, Basic) {
+ test_unsigned_int_value<uint8_t>("123", 123, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint16_t>("123", 123, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint32_t>("123", 123, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint64_t>("123", 123, StringParser::PARSE_SUCCESS);
+
+ test_unsigned_int_value<uint8_t>("123", 123, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint16_t>("12345", 12345, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint32_t>("12345678", 12345678, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint64_t>("12345678901234", 12345678901234, StringParser::PARSE_SUCCESS);
+
+ test_unsigned_int_value<uint8_t>("-10", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint16_t>("-10", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint32_t>("-10", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint64_t>("-10", 0, StringParser::PARSE_FAILURE);
+
+ test_unsigned_int_value<uint8_t>("+1", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint16_t>("+1", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint32_t>("+1", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint64_t>("+1", 0, StringParser::PARSE_FAILURE);
+
+ test_unsigned_int_value<uint8_t>("+0", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint16_t>("-0", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint32_t>("+0", 0, StringParser::PARSE_FAILURE);
+ test_unsigned_int_value<uint64_t>("-0", 0, StringParser::PARSE_FAILURE);
+}
+
+TEST(StringToUnsignedInt, Limit) {
+ test_unsigned_int_value<uint8_t>("255", 255, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint16_t>("65535", 65535, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint32_t>("4294967295", 4294967295, StringParser::PARSE_SUCCESS);
+ test_unsigned_int_value<uint64_t>("18446744073709551615",
+ std::numeric_limits<uint64_t>::max(),
+ StringParser::PARSE_SUCCESS);
+}
+
+TEST(StringToUnsignedInt, Overflow) {
+ test_unsigned_int_value<uint8_t>("256", 255, StringParser::PARSE_OVERFLOW);
+ test_unsigned_int_value<uint16_t>("65536", 65535, StringParser::PARSE_OVERFLOW);
+ test_unsigned_int_value<uint32_t>("4294967296", 4294967295, StringParser::PARSE_OVERFLOW);
+ test_unsigned_int_value<uint64_t>(
+ "18446744073709551616",
+ std::numeric_limits<uint64_t>::max(),
+ StringParser::PARSE_OVERFLOW);
}
TEST(StringToInt, Overflow) {
diff --git a/run-ut.sh b/run-ut.sh
index 07c035d..d0720ed 100755
--- a/run-ut.sh
+++ b/run-ut.sh
@@ -151,6 +151,7 @@ ${DORIS_TEST_BINARY_DIR}/util/byte_buffer_test2
${DORIS_TEST_BINARY_DIR}/util/uid_util_test
${DORIS_TEST_BINARY_DIR}/util/aes_util_test
${DORIS_TEST_BINARY_DIR}/util/string_util_test
+${DORIS_TEST_BINARY_DIR}/util/string_parser_test
${DORIS_TEST_BINARY_DIR}/util/coding_test
${DORIS_TEST_BINARY_DIR}/util/faststring_test
${DORIS_TEST_BINARY_DIR}/util/tdigest_test
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org