You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by li...@apache.org on 2022/10/20 03:11:49 UTC

[doris] branch master updated: [function](string_function) add new string function 'extract_url_parameter' (#13323)

This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 2b328eafbb  [function](string_function) add new string function 'extract_url_parameter' (#13323)
2b328eafbb is described below

commit 2b328eafbb20f5ec206912b41165616f88727f63
Author: DongLiang-0 <46...@users.noreply.github.com>
AuthorDate: Thu Oct 20 11:11:43 2022 +0800

     [function](string_function) add new string function 'extract_url_parameter' (#13323)
---
 be/src/util/url_parser.cpp                         | 57 ++++++++++++++++++++++
 be/src/util/url_parser.h                           |  5 ++
 be/src/vec/functions/function_string.cpp           |  1 +
 be/src/vec/functions/function_string.h             | 50 +++++++++++++++++++
 be/test/vec/function/function_string_test.cpp      | 25 ++++++++++
 .../string-functions/extract_url_parameter.md      | 50 +++++++++++++++++++
 .../string-functions/extract_url_parameter.md      | 50 +++++++++++++++++++
 gensrc/script/doris_builtins_functions.py          |  1 +
 8 files changed, 239 insertions(+)

diff --git a/be/src/util/url_parser.cpp b/be/src/util/url_parser.cpp
index 0ce0913f3d..00d2783bd6 100644
--- a/be/src/util/url_parser.cpp
+++ b/be/src/util/url_parser.cpp
@@ -17,6 +17,8 @@
 
 #include "util/url_parser.h"
 
+#include <string>
+
 #include "runtime/string_value.hpp"
 
 namespace doris {
@@ -344,4 +346,59 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
     }
 }
 
+// Returns the value of the first "key=value" pair in the query string of `url`
+// whose key equals `name` byte-for-byte (nothing is URL-decoded).
+//   url:  the url to inspect; it is passed through StringValue::trim() first.
+//   name: the parameter key to look up.
+// Returns an empty string when `url` contains no '?' or no pair has that key.
+// When several pairs share the key, the value of the first one is returned.
+// Example: url "http://doris.apache.org?k1=aa&k2=bb#999", name "k2" -> "bb".
+std::string UrlParser::extract_url(const StringValue& url, const StringValue& name) {
+    // Remove leading and trailing spaces.
+    StringValue trimmed_url = url.trim();
+    // The query string starts right after the '?' found by _s_question_search.
+    const int32_t question_pos = _s_question_search.search(&trimmed_url);
+    if (question_pos < 0) {
+        // This url has no parameters.
+        // Example: https://doris.apache.org/
+        return "";
+    }
+
+    // The query string ends at the '#' found by _s_hash_search when that '#'
+    // follows the '?', otherwise at the end of the url. A '#' located before the
+    // '?' is ignored on purpose, so substring() never gets a negative length.
+    const int32_t hash_pos = _s_hash_search.search(&trimmed_url);
+    const int32_t query_begin = question_pos + 1;
+    const int32_t query_end =
+            hash_pos > question_pos ? hash_pos : static_cast<int32_t>(trimmed_url.len);
+    const std::string query =
+            trimmed_url.substring(query_begin, query_end - query_begin).to_string();
+    const std::string key = name.to_string();
+
+    // Walk the '&'-separated pairs in place (no per-pair copy of the remainder).
+    // Example: k1=aa&k2=bb&k3=cc&test=dd
+    std::string::size_type pair_begin = 0;
+    while (pair_begin < query.size()) {
+        // An '=' found first ends the key; an '&' found first means the pair has
+        // no '=' and is skipped as invalid, like "k1" in k1&k2=bb.
+        const std::string::size_type key_end = query.find_first_of("=&", pair_begin);
+        if (key_end == std::string::npos) {
+            // The last pair has no '=': nothing left to match.
+            break;
+        }
+        std::string::size_type pair_end = key_end;
+        if (query[key_end] == '=') {
+            const std::string::size_type and_pos = query.find('&', key_end + 1);
+            pair_end = (and_pos == std::string::npos) ? query.size() : and_pos;
+            // compare() is 0 only when the key has the same length and bytes.
+            if (query.compare(pair_begin, key_end - pair_begin, key) == 0) {
+                return query.substr(key_end + 1, pair_end - key_end - 1);
+            }
+        }
+        pair_begin = pair_end + 1;
+    }
+
+    // No pair carries the requested key.
+    return "";
+}
 } // namespace doris
diff --git a/be/src/util/url_parser.h b/be/src/util/url_parser.h
index 3363f65e6b..0d212b1acd 100644
--- a/be/src/util/url_parser.h
+++ b/be/src/util/url_parser.h
@@ -60,6 +60,11 @@ public:
     // If part did not match any of the url part constants, returns INVALID.
     static UrlPart get_url_part(const StringValue& part);
 
+    // Returns the value of the query parameter `name` in `url`, or an empty string
+    // when `url` has no such parameter. Example: for the url
+    // http://doris.apache.org?k1=aa&k2=bb&k3=cc&test=dd#999 and name "k2" -> "bb".
+    static std::string extract_url(const StringValue& url, const StringValue& name);
+
 private:
     // Constants representing parts of a URL.
     static const StringValue _s_url_authority;
diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index b8bf150249..7033eee2f8 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -678,6 +678,7 @@ void register_function_string(SimpleFunctionFactory& factory) {
     factory.register_function<FunctionFromBase64>();
     factory.register_function<FunctionSplitPart>();
     factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>();
+    factory.register_function<FunctionExtractURLParameter>();
     factory.register_function<FunctionStringParseUrl>();
     factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>();
     factory.register_function<FunctionMoneyFormat<MoneyFormatInt64Impl>>();
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 9588e7a2c0..403bc7b2f5 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -31,6 +31,8 @@
 #include <fmt/ranges.h>
 
 #include <cstdint>
+#include <memory_resource>
+#include <string>
 #include <string_view>
 
 #include "exprs/math_functions.h"
@@ -1248,6 +1250,54 @@ public:
     }
 };
 
+// SQL function extract_url_parameter(url, name), evaluated column-wise. For every
+// row it yields an empty string when either argument is empty, and otherwise the
+// value returned by UrlParser::extract_url() for that row's url and name.
+class FunctionExtractURLParameter : public IFunction {
+public:
+    static constexpr auto name = "extract_url_parameter";
+    static FunctionPtr create() { return std::make_shared<FunctionExtractURLParameter>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return std::make_shared<DataTypeString>();
+    }
+    bool use_default_implementation_for_constants() const override { return true; }
+    // Reads the url column (arguments[0]) and the parameter-name column (arguments[1]),
+    // and stores one extracted value per input row at position `result` of `block`.
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        auto col_url =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+        auto col_parameter =
+                block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
+        auto url_col = assert_cast<const ColumnString*>(col_url.get());
+        auto parameter_col = assert_cast<const ColumnString*>(col_parameter.get());
+        ColumnString::MutablePtr col_res = ColumnString::create();
+        // size_t index: same type as input_rows_count, so no signed/unsigned compare.
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            auto source = url_col->get_data_at(i);
+            auto param = parameter_col->get_data_at(i);
+            StringValue url_str(const_cast<char*>(source.data), source.size);
+            StringValue parameter_str(const_cast<char*>(param.data), param.size);
+            // Named `value` so that it does not shadow the `result` column position.
+            const std::string value = extract_url(url_str, parameter_str);
+            col_res->insert_data(value.data(), value.length());
+        }
+        block.replace_by_position(result, std::move(col_res));
+        return Status::OK();
+    }
+
+private:
+    // Short-circuits empty inputs; otherwise delegates to UrlParser::extract_url().
+    static std::string extract_url(StringValue url, StringValue parameter) {
+        if (url.len == 0 || parameter.len == 0) {
+            return "";
+        }
+        return UrlParser::extract_url(url, parameter);
+    }
+};
+
 class FunctionStringParseUrl : public IFunction {
 public:
     static constexpr auto name = "parse_url";
diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp
index 3ee81d1391..e996bb13f9 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ b/be/test/vec/function/function_string_test.cpp
@@ -953,6 +953,31 @@ TEST(function_string_test, function_sm4_decrypt_test) {
     }
 }
 
+// extract_url_parameter(url, name): each row lists {url, name} and then the expected value.
+TEST(function_string_test, function_extract_url_parameter_test) {
+    std::string func_name = "extract_url_parameter";
+    InputTypeSet input_types = {TypeIndex::String, TypeIndex::String};
+    DataSet data_set = {
+            {{VARCHAR(""), VARCHAR("k1")}, {VARCHAR("")}},                             // empty url
+            {{VARCHAR("http://doris.apache.org?k1=aa"), VARCHAR("")}, {VARCHAR("")}},  // empty name
+            {{VARCHAR("https://doris.apache.org/"), VARCHAR("k1")}, {VARCHAR("")}},    // no '?'
+            {{VARCHAR("http://doris.apache.org?"), VARCHAR("k1")}, {VARCHAR("")}},     // no pairs
+            {{VARCHAR("http://doris.apache.org?k1=aa"), VARCHAR("k1")}, {VARCHAR("aa")}},
+            {{VARCHAR("http://doris.apache.org:8080?k1&k2=bb#99"), VARCHAR("k1")}, {VARCHAR("")}},
+            {{VARCHAR("http://doris.apache.org?k1=aa#999"), VARCHAR("k1")}, {VARCHAR("aa")}},
+            {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k1")},
+             {VARCHAR("aa")}},
+            {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k2")},
+             {VARCHAR("bb")}},
+            {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("999")},
+             {VARCHAR("")}},
+            {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k3")},
+             {VARCHAR("")}},
+            {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("test")},
+             {VARCHAR("dd")}}};
+    check_function<DataTypeString, true>(func_name, input_types, data_set);
+}
+
 TEST(function_string_test, function_parse_url_test) {
     std::string func_name = "parse_url";
 
diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md b/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
new file mode 100644
index 0000000000..eb4bd8301b
--- /dev/null
+++ b/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
@@ -0,0 +1,50 @@
+---
+{
+"title": "extract_url_parameter",
+"language": "en"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE 
+file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on 
+an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## extract_url_parameter
+### description
+#### Syntax
+
+`VARCHAR  extract_url_parameter(VARCHAR url, VARCHAR  name)`
+
+
+Returns the value of the parameter `name` in the URL if it is present; otherwise returns an empty string.
+If several parameters have this name, the value of the first occurrence is returned.
+The name is matched exactly as given: the function assumes that the parameter name is encoded in the URL in the same way as in the `name` argument.
+
+```
+mysql> SELECT extract_url_parameter ("http://doris.apache.org?k1=aa&k2=bb&test=cc#999", "k2");
++--------------------------------------------------------------------------------+
+| extract_url_parameter('http://doris.apache.org?k1=aa&k2=bb&test=cc#999', 'k2') |
++--------------------------------------------------------------------------------+
+| bb                                                                             |
++--------------------------------------------------------------------------------+
+```
+
+### keywords
+    EXTRACT URL PARAMETER
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
new file mode 100644
index 0000000000..2a17ede2d2
--- /dev/null
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
@@ -0,0 +1,50 @@
+---
+{
+"title": "extract_url_parameter",
+"language": "zh-CN"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE 
+file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on 
+an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## extract_url_parameter
+### description
+#### Syntax
+
+`VARCHAR  extract_url_parameter(VARCHAR url, VARCHAR  name)`
+
+
+返回 URL 中“name”参数的值(如果存在)。否则为空字符串。
+如果有许多具有此名称的参数,则返回第一个出现的参数。
+此函数的工作假设参数名称在 URL 中的编码方式与在传递参数中的编码方式完全相同。
+
+```
+mysql> SELECT extract_url_parameter ("http://doris.apache.org?k1=aa&k2=bb&test=cc#999", "k2");
++--------------------------------------------------------------------------------+
+| extract_url_parameter('http://doris.apache.org?k1=aa&k2=bb&test=cc#999', 'k2') |
++--------------------------------------------------------------------------------+
+| bb                                                                             |
++--------------------------------------------------------------------------------+
+```
+
+### keywords
+    EXTRACT URL PARAMETER
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index ed71c714fc..e832428207 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2137,6 +2137,7 @@ visible_functions = [
     [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
         '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
         '', '', 'vec', 'ALWAYS_NULLABLE'],
+     [['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''],
 
     # Longtext function
     [['substr', 'substring'], 'STRING', ['STRING', 'INT'],


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org