You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/10/21 00:33:45 UTC

[doris] branch master updated: [opt](function) refactor extract_url to use StringValue (#13508)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e62d3dd8e5 [opt](function) refactor extract_url to use StringValue (#13508)
e62d3dd8e5 is described below

commit e62d3dd8e58979f8776bff0612f19dfecdb100fa
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Fri Oct 21 08:33:39 2022 +0800

    [opt](function) refactor extract_url to use StringValue (#13508)
    
    Change extract_url to use StringValue instead of std::string to speed it up.
---
 be/src/runtime/string_value.h          |  3 +++
 be/src/runtime/string_value.hpp        |  5 ++++
 be/src/util/url_parser.cpp             | 45 +++++++++++++++++-----------------
 be/src/util/url_parser.h               |  2 +-
 be/src/vec/functions/function_string.h | 15 +++++-------
 docs/sidebars.json                     |  1 +
 6 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/be/src/runtime/string_value.h b/be/src/runtime/string_value.h
index 13b3852a5d..4878ca31e4 100644
--- a/be/src/runtime/string_value.h
+++ b/be/src/runtime/string_value.h
@@ -173,6 +173,9 @@ struct StringValue {
     // Trims leading and trailing spaces.
     StringValue trim() const;
 
+    // Returns the position of the first occurrence of c, or -1 if not found.
+    int64_t find_first_of(char c) const;
+
     void to_string_val(doris_udf::StringVal* sv) const {
         *sv = doris_udf::StringVal(reinterpret_cast<uint8_t*>(ptr), len);
     }
diff --git a/be/src/runtime/string_value.hpp b/be/src/runtime/string_value.hpp
index 961e8b86c5..fb8039e661 100644
--- a/be/src/runtime/string_value.hpp
+++ b/be/src/runtime/string_value.hpp
@@ -51,4 +51,9 @@ inline StringValue StringValue::trim() const {
     return StringValue(ptr + begin, end - begin + 1);
 }
 
+inline int64_t StringValue::find_first_of(char c) const {
+    const char* p = static_cast<const char*>(memchr(ptr, c, len));
+    return p == nullptr ? -1 : p - ptr;
+}
+
 } // namespace doris
diff --git a/be/src/util/url_parser.cpp b/be/src/util/url_parser.cpp
index 00d2783bd6..06ed454251 100644
--- a/be/src/util/url_parser.cpp
+++ b/be/src/util/url_parser.cpp
@@ -346,9 +346,8 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 }
 
-std::string UrlParser::extract_url(const StringValue& url, const StringValue& name) {
-    std::string result;
-    std::string str_name = name.to_string();
+StringValue UrlParser::extract_url(StringValue url, StringValue name) {
+    StringValue result("", 0);
     // Remove leading and trailing spaces.
     StringValue trimmed_url = url.trim();
     // find '?'
@@ -358,45 +357,45 @@ std::string UrlParser::extract_url(const StringValue& url, const StringValue& na
         // Example: https://doris.apache.org/
         return result;
     }
+
     // find '#'
     int32_t hash_pos = _s_hash_search.search(&trimmed_url);
-    std::string sub_url = "";
+    StringValue sub_url;
     if (hash_pos < 0) {
-        sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.len - question_pos - 1)
-                          .to_string();
+        sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.len - question_pos - 1);
     } else {
-        sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1).to_string();
+        sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1);
     }
 
     // find '&' and '=', and extract target parameter
     // Example: k1=aa&k2=bb&k3=cc&test=dd
-    std::string::size_type and_pod;
-    std::string::size_type len = sub_url.length();
-    std::string key_url;
+    int64_t and_pod;
+    auto len = sub_url.len;
+    StringValue key_url;
     while (true) {
         if (len <= 0) {
             break;
         }
         and_pod = sub_url.find_first_of('&');
-        if (and_pod != std::string::npos) {
-            key_url = sub_url.substr(0, and_pod);
-            sub_url = sub_url.substr(and_pod + 1, len - and_pod);
+        if (and_pod != -1) {
+            key_url = sub_url.substring(0, and_pod);
+            sub_url = sub_url.substring(and_pod + 1, len - and_pod);
         } else {
-            key_url = sub_url;
-            sub_url = "";
+            auto end_pos = sub_url.find_first_of('#');
+            key_url = end_pos == -1 ? sub_url : sub_url.substring(0, end_pos);
+            sub_url = result;
         }
-        len = sub_url.length();
+        len = sub_url.len;
 
-        std::string::size_type eq_pod = key_url.find_first_of('=');
-        if (eq_pod == std::string::npos) {
+        auto eq_pod = key_url.find_first_of('=');
+        if (eq_pod == -1) {
             // invalid url. like: k1&k2=bb
             continue;
         }
-        int32_t key_len = key_url.length();
-        std::string key = key_url.substr(0, eq_pod);
-        if (str_name == key) {
-            result = key_url.substr(eq_pod + 1, key_len - eq_pod);
-            return result;
+        int32_t key_len = key_url.len;
+        auto key = key_url.substring(0, eq_pod);
+        if (name == key) {
+            return key_url.substring(eq_pod + 1, key_len - eq_pod - 1);
         }
     }
     return result;
diff --git a/be/src/util/url_parser.h b/be/src/util/url_parser.h
index 0d212b1acd..e2a7ca6872 100644
--- a/be/src/util/url_parser.h
+++ b/be/src/util/url_parser.h
@@ -63,7 +63,7 @@ public:
     // Extract parameter value from url
     // Example for url:
     // http://doris.apache.org?k1=aa&k2=bb&k3=cc&test=dd#999
-    static std::string extract_url(const StringValue& url, const StringValue& name);
+    static StringValue extract_url(StringValue url, StringValue name);
 
 private:
     // Constants representing parts of a URL.
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 2b6fe69764..cc51514b8b 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1276,12 +1276,9 @@ public:
         for (int i = 0; i < input_rows_count; ++i) {
             auto source = url_col->get_data_at(i);
             auto param = parameter_col->get_data_at(i);
-            StringValue url_str(const_cast<char*>(source.data), source.size);
-            StringValue parameter_str(const_cast<char*>(param.data), param.size);
+            auto res = extract_url(source, param);
 
-            std::string result = extract_url(url_str, parameter_str);
-
-            col_res->insert_data(result.data(), result.length());
+            col_res->insert_data(res.ptr, res.len);
         }
 
         block.replace_by_position(result, std::move(col_res));
@@ -1289,11 +1286,11 @@ public:
     }
 
 private:
-    std::string extract_url(StringValue url, StringValue parameter) {
-        if (url.len == 0 || parameter.len == 0) {
-            return "";
+    StringValue extract_url(StringRef url, StringRef parameter) {
+        if (url.size == 0 || parameter.size == 0) {
+            return StringValue("", 0);
         }
-        return UrlParser::extract_url(url, parameter);
+        return UrlParser::extract_url(StringValue(url), StringValue(parameter));
     }
 };
 
diff --git a/docs/sidebars.json b/docs/sidebars.json
index 31193a7e78..57eca79a93 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -388,6 +388,7 @@
                                 "sql-manual/sql-functions/string-functions/split_part",
                                 "sql-manual/sql-functions/string-functions/money_format",
                                 "sql-manual/sql-functions/string-functions/parse_url",
+                                "sql-manual/sql-functions/string-functions/extract_url_parameter",
                                 {
                                     "type": "category",
                                     "label": "Fuzzy Match",


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org