You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by zh...@apache.org on 2019/11/19 10:00:24 UTC

[incubator-doris] branch master updated: Improve to_bitmap parse int performance (#2223)

This is an automated email from the ASF dual-hosted git repository.

zhaoc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 14769b0  Improve to_bitmap parse int performance (#2223)
14769b0 is described below

commit 14769b0beb7671626da011611d2c73ffe841b831
Author: kangkaisen <ka...@apache.org>
AuthorDate: Tue Nov 19 18:00:19 2019 +0800

    Improve to_bitmap parse int performance (#2223)
---
 be/src/exprs/bitmap_function.cpp       | 24 ++++-------
 be/src/util/string_parser.hpp          | 68 +++++++++++++++++++++++++++++++
 be/test/exprs/bitmap_function_test.cpp |  4 +-
 be/test/util/CMakeLists.txt            |  1 +
 be/test/util/string_parser_test.cpp    | 73 +++++++++++++++++++++++++++++++---
 run-ut.sh                              |  1 +
 6 files changed, 146 insertions(+), 25 deletions(-)

diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp
index 107362a..2f1942a 100644
--- a/be/src/exprs/bitmap_function.cpp
+++ b/be/src/exprs/bitmap_function.cpp
@@ -19,6 +19,7 @@
 
 #include "exprs/anyval_util.h"
 #include "util/bitmap.h"
+#include "util/string_parser.hpp"
 
 namespace doris {
 void BitmapFunctions::init() {
@@ -79,27 +80,16 @@ BigIntVal BitmapFunctions::bitmap_count(FunctionContext* ctx, const StringVal& s
 StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) {
     std::unique_ptr<RoaringBitmap> bitmap {new RoaringBitmap()};
     if (!src.is_null) {
-        std::string tmp_str = std::string(reinterpret_cast<char*>(src.ptr), src.len) ;
-        unsigned long uint32_value = 0;
-        try {
-            uint32_value = std::stoul(tmp_str);
-            // the std::stoul result type is unsigned long, not uint32_t. so we need check it
-            if(UNLIKELY(uint32_value > std::numeric_limits<unsigned int>::max())) {
-                throw std::out_of_range("");
-            }
-        } catch (std::invalid_argument& e) {
+        StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
+        uint32_t int_value = StringParser::string_to_unsigned_int<uint32_t>(reinterpret_cast<char*>(src.ptr), src.len, &parse_result);
+        if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) {
             std::stringstream error_msg;
-            error_msg << "The to_bitmap function argument: " << tmp_str << " type isn't integer family";
-            ctx->set_error(error_msg.str().c_str());
-            return StringVal::null();
-        } catch (std::out_of_range& e) {
-            std::stringstream error_msg;
-            error_msg << "The to_bitmap function argument: " << tmp_str << " exceed unsigned integer max value "
-                      << std::numeric_limits<unsigned int>::max();
+            error_msg << "The to_bitmap function argument: " << std::string(reinterpret_cast<char*>(src.ptr), src.len)
+            << " type isn't integer family or exceed unsigned integer max value 4294967295";
             ctx->set_error(error_msg.str().c_str());
             return StringVal::null();
         }
-        bitmap->update(uint32_value);
+        bitmap->update(int_value);
     }
     std::string buf;
     buf.resize(bitmap->size());
diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp
index 3a0aa73..0354343 100644
--- a/be/src/util/string_parser.hpp
+++ b/be/src/util/string_parser.hpp
@@ -86,6 +86,20 @@ public:
         return string_to_int_internal<T>(s + i, len - i, result);
     }
 
+    // This is considerably faster than glibc's implementation.
+    // In the case of overflow, the max/min value for the data type will be returned.
+    // Assumes s represents a decimal number.
+    template <typename T>
+    static inline T string_to_unsigned_int(const char* s, int len, ParseResult* result) {
+        T ans = string_to_unsigned_int_internal<T>(s, len, result);
+        if (LIKELY(*result == PARSE_SUCCESS)){
+            return ans;
+        }
+
+        int i = skip_leading_whitespace(s, len);
+        return string_to_unsigned_int_internal<T>(s + i, len - i, result);
+    }
+
     // Convert a string s representing a number in given base into a decimal number.
     template <typename T>
     static inline T string_to_int(const char* s, int len, int base, ParseResult* result) {
@@ -159,6 +173,13 @@ private:
     template <typename T>
     static inline T string_to_int_internal(const char* s, int len, ParseResult* result);
 
+    // This is considerably faster than glibc's implementation.
+    // In the case of overflow, the max/min value for the data type will be returned.
+    // Assumes s represents a decimal number.
+    // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed.
+    template <typename T>
+    static inline T string_to_unsigned_int_internal(const char* s, int len, ParseResult* result);
+
     // Convert a string s representing a number in given base into a decimal number.
     // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed.
     template <typename T>
@@ -267,6 +288,53 @@ inline T StringParser::string_to_int_internal(const char* s, int len, ParseResul
 }
 
 template <typename T>
+inline T StringParser::string_to_unsigned_int_internal(const char* s, int len, ParseResult* result) {
+    if (UNLIKELY(len <= 0)) {
+        *result = PARSE_FAILURE;
+        return 0;
+    }
+
+    T val = 0;
+    T max_val = std::numeric_limits<T>::max();
+    int i = 0;
+
+    typedef typename std::make_signed<T>::type signedT;
+    // This is the fast path where the string cannot overflow.
+    if (LIKELY(len - i < StringParseTraits<signedT>::max_ascii_len())) {
+        val = string_to_int_no_overflow<T>(s + i, len - i, result);
+        return val;
+    }
+
+    const T max_div_10 = max_val / 10;
+    const T max_mod_10 = max_val % 10;
+
+    int first = i;
+    for (; i < len; ++i) {
+        if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
+            T digit = s[i] - '0';
+            // This is a tricky check to see if adding this digit will cause an overflow.
+            if (UNLIKELY(val > (max_div_10 - (digit > max_mod_10)))) {
+                *result = PARSE_OVERFLOW;
+                return max_val;
+            }
+            val = val * 10 + digit;
+        } else {
+            if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) {
+                // Reject the string because either the first char was not a digit,
+                // or the remaining chars are not all whitespace
+                *result = PARSE_FAILURE;
+                return 0;
+            }
+            // Returning here is slightly faster than breaking the loop.
+            *result = PARSE_SUCCESS;
+            return val;
+        }
+    }
+    *result = PARSE_SUCCESS;
+    return val;
+}
+
+template <typename T>
 inline T StringParser::string_to_int_internal(
         const char* s, int len, int base, ParseResult* result) {
     typedef typename std::make_unsigned<T>::type UnsignedT;
diff --git a/be/test/exprs/bitmap_function_test.cpp b/be/test/exprs/bitmap_function_test.cpp
index 228a104..b8492f2 100644
--- a/be/test/exprs/bitmap_function_test.cpp
+++ b/be/test/exprs/bitmap_function_test.cpp
@@ -90,7 +90,7 @@ TEST_F(BitmapFunctionsTest, to_bitmap_invalid_argument) {
     ASSERT_EQ(expected, result);
     ASSERT_TRUE(ctx->has_error());
 
-    std::string error_msg("The to_bitmap function argument: xxxxxx type isn't integer family");
+    std::string error_msg("The to_bitmap function argument: xxxxxx type isn't integer family or exceed unsigned integer max value 4294967295");
     ASSERT_EQ(error_msg, ctx->error_msg());
 }
 
@@ -103,7 +103,7 @@ TEST_F(BitmapFunctionsTest, to_bitmap_out_of_range) {
 
     ASSERT_TRUE(ctx->has_error());
 
-    std::string error_msg("The to_bitmap function argument: 4294967296 exceed unsigned integer max value 4294967295");
+    std::string error_msg("The to_bitmap function argument: 4294967296 type isn't integer family or exceed unsigned integer max value 4294967295");
     ASSERT_EQ(error_msg, ctx->error_msg());
 }
 
diff --git a/be/test/util/CMakeLists.txt b/be/test/util/CMakeLists.txt
index bc78c18..1b4739e 100644
--- a/be/test/util/CMakeLists.txt
+++ b/be/test/util/CMakeLists.txt
@@ -32,6 +32,7 @@ ADD_BE_TEST(new_metrics_test)
 ADD_BE_TEST(doris_metrics_test)
 ADD_BE_TEST(system_metrics_test)
 ADD_BE_TEST(string_util_test)
+ADD_BE_TEST(string_parser_test)
 ADD_BE_TEST(core_local_test)
 ADD_BE_TEST(types_test)
 ADD_BE_TEST(json_util_test)
diff --git a/be/test/util/string_parser_test.cpp b/be/test/util/string_parser_test.cpp
index 0fd308b..dcfb3a6 100644
--- a/be/test/util/string_parser_test.cpp
+++ b/be/test/util/string_parser_test.cpp
@@ -47,6 +47,21 @@ void test_int_value(const char* s, T exp_val, StringParser::ParseResult exp_resu
     }
 }
 
+// Tests conversion of s to integer with and without leading/trailing whitespace
+template<typename T>
+void test_unsigned_int_value(const char* s, T exp_val, StringParser::ParseResult exp_result) {
+    for (int i = 0; i < space_len; ++i) {
+        for (int j = 0; j < space_len; ++j) {
+            // All combinations of leading and/or trailing whitespace.
+            std::string str = space[i] + s + space[j];
+            StringParser::ParseResult result;
+            T val = StringParser::string_to_unsigned_int<T>(str.data(), str.length(), &result);
+            EXPECT_EQ(exp_val, val) << str;
+            EXPECT_EQ(result, exp_result);
+        }
+    }
+}
+
 // Tests conversion of s, given a base, to an integer with and without leading/trailing whitespace
 template<typename T>
 void test_int_value(const char* s, int base, T exp_val, StringParser::ParseResult exp_result) {
@@ -209,13 +224,59 @@ TEST(StringToInt, Limit) {
     test_int_value<int32_t>("2147483647", 2147483647, StringParser::PARSE_SUCCESS);
     test_int_value<int32_t>("-2147483648", -2147483648, StringParser::PARSE_SUCCESS);
     test_int_value<int64_t>(
-            "9223372036854775807",
-            std::numeric_limits<int64_t>::max(),
-            StringParser::PARSE_SUCCESS);
+        "9223372036854775807",
+        std::numeric_limits<int64_t>::max(),
+        StringParser::PARSE_SUCCESS);
     test_int_value<int64_t>(
-            "-9223372036854775808",
-            std::numeric_limits<int64_t>::min(),
-            StringParser::PARSE_SUCCESS);
+        "-9223372036854775808",
+        std::numeric_limits<int64_t>::min(),
+        StringParser::PARSE_SUCCESS);
+}
+
+TEST(StringToUnsignedInt, Basic) {
+    test_unsigned_int_value<uint8_t>("123", 123, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint16_t>("123", 123, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint32_t>("123", 123, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint64_t>("123", 123, StringParser::PARSE_SUCCESS);
+
+    test_unsigned_int_value<uint8_t>("123", 123, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint16_t>("12345", 12345, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint32_t>("12345678", 12345678, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint64_t>("12345678901234", 12345678901234, StringParser::PARSE_SUCCESS);
+
+    test_unsigned_int_value<uint8_t>("-10", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint16_t>("-10", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint32_t>("-10", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint64_t>("-10", 0, StringParser::PARSE_FAILURE);
+
+    test_unsigned_int_value<uint8_t>("+1", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint16_t>("+1", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint32_t>("+1", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint64_t>("+1", 0, StringParser::PARSE_FAILURE);
+
+    test_unsigned_int_value<uint8_t>("+0", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint16_t>("-0", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint32_t>("+0", 0, StringParser::PARSE_FAILURE);
+    test_unsigned_int_value<uint64_t>("-0", 0, StringParser::PARSE_FAILURE);
+}
+
+TEST(StringToUnsignedInt, Limit) {
+    test_unsigned_int_value<uint8_t>("255", 255, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint16_t>("65535", 65535, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint32_t>("4294967295", 4294967295, StringParser::PARSE_SUCCESS);
+    test_unsigned_int_value<uint64_t>("18446744073709551615",
+                                      std::numeric_limits<uint64_t>::max(),
+                                      StringParser::PARSE_SUCCESS);
+}
+
+TEST(StringToUnsignedInt, Overflow) {
+    test_unsigned_int_value<uint8_t>("256", 255, StringParser::PARSE_OVERFLOW);
+    test_unsigned_int_value<uint16_t>("65536", 65535, StringParser::PARSE_OVERFLOW);
+    test_unsigned_int_value<uint32_t>("4294967296", 4294967295, StringParser::PARSE_OVERFLOW);
+    test_unsigned_int_value<uint64_t>(
+        "18446744073709551616",
+        std::numeric_limits<uint64_t>::max(),
+        StringParser::PARSE_OVERFLOW);
 }
 
 TEST(StringToInt, Overflow) {
diff --git a/run-ut.sh b/run-ut.sh
index 07c035d..d0720ed 100755
--- a/run-ut.sh
+++ b/run-ut.sh
@@ -151,6 +151,7 @@ ${DORIS_TEST_BINARY_DIR}/util/byte_buffer_test2
 ${DORIS_TEST_BINARY_DIR}/util/uid_util_test
 ${DORIS_TEST_BINARY_DIR}/util/aes_util_test
 ${DORIS_TEST_BINARY_DIR}/util/string_util_test
+${DORIS_TEST_BINARY_DIR}/util/string_parser_test
 ${DORIS_TEST_BINARY_DIR}/util/coding_test
 ${DORIS_TEST_BINARY_DIR}/util/faststring_test
 ${DORIS_TEST_BINARY_DIR}/util/tdigest_test


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org