You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by li...@apache.org on 2022/10/20 03:11:49 UTC
[doris] branch master updated: [function](string_function) add new string function 'extract_url_parameter' (#13323)
This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 2b328eafbb [function](string_function) add new string function 'extract_url_parameter' (#13323)
2b328eafbb is described below
commit 2b328eafbb20f5ec206912b41165616f88727f63
Author: DongLiang-0 <46...@users.noreply.github.com>
AuthorDate: Thu Oct 20 11:11:43 2022 +0800
[function](string_function) add new string function 'extract_url_parameter' (#13323)
---
be/src/util/url_parser.cpp | 57 ++++++++++++++++++++++
be/src/util/url_parser.h | 5 ++
be/src/vec/functions/function_string.cpp | 1 +
be/src/vec/functions/function_string.h | 50 +++++++++++++++++++
be/test/vec/function/function_string_test.cpp | 25 ++++++++++
.../string-functions/extract_url_parameter.md | 50 +++++++++++++++++++
.../string-functions/extract_url_parameter.md | 50 +++++++++++++++++++
gensrc/script/doris_builtins_functions.py | 1 +
8 files changed, 239 insertions(+)
diff --git a/be/src/util/url_parser.cpp b/be/src/util/url_parser.cpp
index 0ce0913f3d..00d2783bd6 100644
--- a/be/src/util/url_parser.cpp
+++ b/be/src/util/url_parser.cpp
@@ -17,6 +17,8 @@
#include "util/url_parser.h"
+#include <string>
+
#include "runtime/string_value.hpp"
namespace doris {
@@ -344,4 +346,59 @@ UrlParser::UrlPart UrlParser::get_url_part(const StringValue& part) {
}
}
+// Returns the value of the first `key=value` pair in the query string of `url`
+// whose key is exactly equal to `name`.
+//
+// The query string is the text after the '?' located by _s_question_search and
+// before the '#' located by _s_hash_search (or the end of the URL when there is
+// no '#'). Pairs are separated by '&'; a segment without '=' is skipped.
+// No URL decoding is performed on keys or values.
+//
+// Returns an empty string when the URL has no '?', when the '#' precedes the
+// '?' (the '?' then belongs to the fragment, so there is no query component),
+// or when no pair matches.
+//
+// Example: url = "http://doris.apache.org?k1=aa&k2=bb#999", name = "k2" -> "bb"
+std::string UrlParser::extract_url(const StringValue& url, const StringValue& name) {
+    const std::string str_name = name.to_string();
+    // Remove leading and trailing spaces.
+    StringValue trimmed_url = url.trim();
+
+    // Locate '?'. Without it the URL carries no parameters,
+    // e.g. https://doris.apache.org/
+    const int32_t question_pos = _s_question_search.search(&trimmed_url);
+    if (question_pos < 0) {
+        return std::string();
+    }
+
+    // Locate '#'. If it sits before the '?', the query length computed below
+    // would be negative; treat that as "no query string" instead of handing a
+    // negative length to StringValue::substring.
+    const int32_t hash_pos = _s_hash_search.search(&trimmed_url);
+    if (hash_pos >= 0 && hash_pos < question_pos) {
+        return std::string();
+    }
+    const int32_t query_end = (hash_pos < 0) ? trimmed_url.len : hash_pos;
+    const std::string query =
+            trimmed_url.substring(question_pos + 1, query_end - question_pos - 1).to_string();
+
+    // Walk the query once, segment by segment, without re-copying the tail.
+    // Example: k1=aa&k2=bb&k3=cc&test=dd
+    std::string::size_type pos = 0;
+    while (pos < query.length()) {
+        const std::string::size_type delim = query.find_first_of("&=", pos);
+        if (delim == std::string::npos) {
+            // Last segment has no '=': invalid pair, nothing left to inspect.
+            break;
+        }
+        if (query[delim] == '&') {
+            // Segment without '=' (like "k1" in "k1&k2=bb"): skip it.
+            pos = delim + 1;
+            continue;
+        }
+
+        // query[delim] == '=': the key is [pos, delim), the value runs up to
+        // the next '&' or the end of the query.
+        std::string::size_type value_end = query.find('&', delim + 1);
+        if (value_end == std::string::npos) {
+            value_end = query.length();
+        }
+        if (query.compare(pos, delim - pos, str_name) == 0) {
+            return query.substr(delim + 1, value_end - delim - 1);
+        }
+        pos = value_end + 1;
+    }
+    return std::string();
+}
} // namespace doris
diff --git a/be/src/util/url_parser.h b/be/src/util/url_parser.h
index 3363f65e6b..0d212b1acd 100644
--- a/be/src/util/url_parser.h
+++ b/be/src/util/url_parser.h
@@ -60,6 +60,11 @@ public:
// If part did not match any of the url part constants, returns INVALID.
static UrlPart get_url_part(const StringValue& part);
+ // Extract the value of the query parameter `name` from `url`.
+ // The query string is the text between '?' and '#' (or the end of the url
+ // when there is no '#'); it is split on '&' and each piece on its first '='.
+ // Returns the value of the first pair whose key equals `name` exactly, or an
+ // empty string when the url has no '?' or no pair matches. Pieces without
+ // '=' are skipped.
+ // Example for url:
+ // http://doris.apache.org?k1=aa&k2=bb&k3=cc&test=dd#999
+ // with name "k2" the result is "bb".
+ static std::string extract_url(const StringValue& url, const StringValue& name);
+
private:
// Constants representing parts of a URL.
static const StringValue _s_url_authority;
diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index b8bf150249..7033eee2f8 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -678,6 +678,7 @@ void register_function_string(SimpleFunctionFactory& factory) {
factory.register_function<FunctionFromBase64>();
factory.register_function<FunctionSplitPart>();
factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>();
+ factory.register_function<FunctionExtractURLParameter>();
factory.register_function<FunctionStringParseUrl>();
factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>();
factory.register_function<FunctionMoneyFormat<MoneyFormatInt64Impl>>();
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 9588e7a2c0..403bc7b2f5 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -31,6 +31,8 @@
#include <fmt/ranges.h>
#include <cstdint>
+#include <memory_resource>
+#include <string>
#include <string_view>
#include "exprs/math_functions.h"
@@ -1248,6 +1250,54 @@ public:
}
};
+// Vectorized implementation of the SQL function
+//   VARCHAR extract_url_parameter(VARCHAR url, VARCHAR name)
+// For every row it returns the value of the URL parameter `name` found in
+// `url`, or an empty string when either argument is empty or the parameter is
+// not present. The actual parsing is delegated to UrlParser::extract_url.
+class FunctionExtractURLParameter : public IFunction {
+public:
+    static constexpr auto name = "extract_url_parameter";
+    static FunctionPtr create() { return std::make_shared<FunctionExtractURLParameter>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    // The result is always a string column, regardless of the argument types.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return std::make_shared<DataTypeString>();
+    }
+
+    bool use_default_implementation_for_constants() const override { return true; }
+
+    // Reads the url column (arguments[0]) and the parameter-name column
+    // (arguments[1]), computes one output string per input row and stores the
+    // resulting column at position `result` of `block`.
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        // Materialize constant columns so both inputs can be indexed row by row.
+        auto col_url =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+        auto col_parameter =
+                block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
+        const auto* url_col = assert_cast<const ColumnString*>(col_url.get());
+        const auto* parameter_col = assert_cast<const ColumnString*>(col_parameter.get());
+
+        ColumnString::MutablePtr col_res = ColumnString::create();
+
+        // size_t index: input_rows_count is unsigned, so an int counter would
+        // mix signedness in the comparison.
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            auto source = url_col->get_data_at(i);
+            auto param = parameter_col->get_data_at(i);
+            // StringValue wants a mutable char*; the data is only read here.
+            StringValue url_str(const_cast<char*>(source.data), source.size);
+            StringValue parameter_str(const_cast<char*>(param.data), param.size);
+
+            // Named `value` so it does not shadow the `result` column index.
+            const std::string value = extract_url(url_str, parameter_str);
+
+            col_res->insert_data(value.data(), value.length());
+        }
+
+        block.replace_by_position(result, std::move(col_res));
+        return Status::OK();
+    }
+
+private:
+    // Short-circuits empty inputs to an empty result, otherwise forwards to
+    // UrlParser::extract_url.
+    std::string extract_url(StringValue url, StringValue parameter) {
+        if (url.len == 0 || parameter.len == 0) {
+            return "";
+        }
+        return UrlParser::extract_url(url, parameter);
+    }
+};
+
class FunctionStringParseUrl : public IFunction {
public:
static constexpr auto name = "parse_url";
diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp
index 3ee81d1391..e996bb13f9 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ b/be/test/vec/function/function_string_test.cpp
@@ -953,6 +953,31 @@ TEST(function_string_test, function_sm4_decrypt_test) {
}
}
+// Table-driven test for extract_url_parameter(url, name): each row is
+// {{url, name}, {expected result}}.
+TEST(function_string_test, function_extract_url_parameter_test) {
+ std::string func_name = "extract_url_parameter";
+ InputTypeSet input_types = {TypeIndex::String, TypeIndex::String};
+ DataSet data_set = {
+ // Empty url -> empty result.
+ {{VARCHAR(""), VARCHAR("k1")}, {VARCHAR("")}},
+ // Empty parameter name -> empty result.
+ {{VARCHAR("http://doris.apache.org?k1=aa"), VARCHAR("")}, {VARCHAR("")}},
+ // Url without '?' -> empty result.
+ {{VARCHAR("https://doris.apache.org/"), VARCHAR("k1")}, {VARCHAR("")}},
+ // '?' present but nothing after it -> empty result.
+ {{VARCHAR("http://doris.apache.org?"), VARCHAR("k1")}, {VARCHAR("")}},
+ // Single parameter, no fragment.
+ {{VARCHAR("http://doris.apache.org?k1=aa"), VARCHAR("k1")}, {VARCHAR("aa")}},
+ // "k1" appears without '=' so it has no value -> empty result.
+ {{VARCHAR("http://doris.apache.org:8080?k1&k2=bb#99"), VARCHAR("k1")}, {VARCHAR("")}},
+ // The "#999" fragment is not part of the value.
+ {{VARCHAR("http://doris.apache.org?k1=aa#999"), VARCHAR("k1")}, {VARCHAR("aa")}},
+ // Several parameters: first one.
+ {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k1")},
+ {VARCHAR("aa")}},
+ // Several parameters: middle one.
+ {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k2")},
+ {VARCHAR("bb")}},
+ // Text that only occurs in the fragment is not a parameter.
+ {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("999")},
+ {VARCHAR("")}},
+ // Parameter name that does not occur in the url.
+ {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("k3")},
+ {VARCHAR("")}},
+ // Several parameters: last one, directly followed by the fragment.
+ {{VARCHAR("http://doris.apache.org?k1=aa&k2=bb&test=dd#999/"), VARCHAR("test")},
+ {VARCHAR("dd")}}};
+
+ // NOTE(review): presumably evaluates func_name on every row and compares
+ // with the expected value; the meaning of the `true` template argument is
+ // defined by check_function, which is not visible here.
+ check_function<DataTypeString, true>(func_name, input_types, data_set);
+}
+
TEST(function_string_test, function_parse_url_test) {
std::string func_name = "parse_url";
diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md b/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
new file mode 100644
index 0000000000..eb4bd8301b
--- /dev/null
+++ b/docs/en/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
@@ -0,0 +1,50 @@
+---
+{
+"title": "extract_url_parameter",
+"language": "en"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE
+file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on
+an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## extract_url_parameter
+### description
+#### Syntax
+
+`VARCHAR extract_url_parameter(VARCHAR url, VARCHAR name)`
+
+
+Returns the value of the `name` parameter in the URL if it is present; otherwise returns an empty string.
+If several parameters share this name, the value of the first occurrence is returned.
+The parameter name is matched exactly as it appears in the URL: no URL decoding is applied, so `name` must be encoded the same way as it is in the URL.
+
+```
+mysql> SELECT extract_url_parameter ("http://doris.apache.org?k1=aa&k2=bb&test=cc#999", "k2");
++--------------------------------------------------------------------------------+
+| extract_url_parameter('http://doris.apache.org?k1=aa&k2=bb&test=cc#999', 'k2') |
++--------------------------------------------------------------------------------+
+| bb |
++--------------------------------------------------------------------------------+
+```
+
+### keywords
+ EXTRACT URL PARAMETER
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
new file mode 100644
index 0000000000..2a17ede2d2
--- /dev/null
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/extract_url_parameter.md
@@ -0,0 +1,50 @@
+---
+{
+"title": "extract_url_parameter",
+"language": "zh-CN"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE
+file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on
+an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## extract_url_parameter
+### description
+#### Syntax
+
+`VARCHAR extract_url_parameter(VARCHAR url, VARCHAR name)`
+
+
+返回 URL 中“name”参数的值(如果存在)。否则为空字符串。
+如果有许多具有此名称的参数,则返回第一个出现的参数。
+此函数的工作假设参数名称在 URL 中的编码方式与在传递参数中的编码方式完全相同。
+
+```
+mysql> SELECT extract_url_parameter ("http://doris.apache.org?k1=aa&k2=bb&test=cc#999", "k2");
++--------------------------------------------------------------------------------+
+| extract_url_parameter('http://doris.apache.org?k1=aa&k2=bb&test=cc#999', 'k2') |
++--------------------------------------------------------------------------------+
+| bb |
++--------------------------------------------------------------------------------+
+```
+
+### keywords
+ EXTRACT URL PARAMETER
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index ed71c714fc..e832428207 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2137,6 +2137,7 @@ visible_functions = [
[['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
'', '', 'vec', 'ALWAYS_NULLABLE'],
+ [['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''],
# Longtext function
[['substr', 'substring'], 'STRING', ['STRING', 'INT'],
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org