You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by da...@apache.org on 2022/10/31 11:13:14 UTC
[doris] branch master updated: [feature](function)add url functions: domain and protocol (#13662)
This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 7ae60a0ad2 [feature](function)add url functions: domain and protocol (#13662)
7ae60a0ad2 is described below
commit 7ae60a0ad22a54ed5270a53511b73629d4767ccc
Author: Kang <kx...@gmail.com>
AuthorDate: Mon Oct 31 19:13:08 2022 +0800
[feature](function)add url functions: domain and protocol (#13662)
---
be/src/vec/CMakeLists.txt | 1 +
be/src/vec/functions/simple_function_factory.h | 3 +
be/src/vec/functions/url/domain.h | 148 ++++++++++++++++++++++
be/src/vec/functions/url/function_url.cpp | 53 ++++++++
be/src/vec/functions/url/functions_url.h | 165 +++++++++++++++++++++++++
be/src/vec/functions/url/protocol.h | 63 ++++++++++
be/test/CMakeLists.txt | 1 +
be/test/vec/function/function_url_test.cpp | 92 ++++++++++++++
gensrc/script/doris_builtins_functions.py | 11 ++
9 files changed, 537 insertions(+)
diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt
index 612fb3fac4..8f375d9fb7 100644
--- a/be/src/vec/CMakeLists.txt
+++ b/be/src/vec/CMakeLists.txt
@@ -210,6 +210,7 @@ set(VEC_FILES
functions/function_convert_tz.cpp
functions/least_greast.cpp
functions/function_fake.cpp
+ functions/url/function_url.cpp
olap/vgeneric_iterators.cpp
olap/vcollect_iterator.cpp
olap/block_reader.cpp
diff --git a/be/src/vec/functions/simple_function_factory.h b/be/src/vec/functions/simple_function_factory.h
index aef1029449..cc7fca5008 100644
--- a/be/src/vec/functions/simple_function_factory.h
+++ b/be/src/vec/functions/simple_function_factory.h
@@ -82,6 +82,8 @@ void register_function_encryption(SimpleFunctionFactory& factory);
void register_function_regexp_extract(SimpleFunctionFactory& factory);
void register_function_hex_variadic(SimpleFunctionFactory& factory);
+void register_function_url(SimpleFunctionFactory& factory);
+
class SimpleFunctionFactory {
using Creator = std::function<FunctionBuilderPtr()>;
using FunctionCreators = phmap::flat_hash_map<std::string, Creator>;
@@ -215,6 +217,7 @@ public:
register_function_hex_variadic(instance);
register_function_array(instance);
register_function_geo(instance);
+ register_function_url(instance);
});
return instance;
}
diff --git a/be/src/vec/functions/url/domain.h b/be/src/vec/functions/url/domain.h
new file mode 100644
index 0000000000..98c2e053b5
--- /dev/null
+++ b/be/src/vec/functions/url/domain.h
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/domain.h
+// and modified by Doris
+
+#pragma once
+
+// #include <base/find_symbols.h>
+#include <vec/common/string_utils/string_utils.h>
+
+#include <cstring>
+
+#include "vec/functions/url/protocol.h"
+
+namespace doris::vectorized {
+
+/// Validates the host candidate spanning [start_of_host, pos).
+///
+/// @param pos position one past the last character of the candidate.
+/// @param dot_pos position of a '.' recorded by the caller, or nullptr if there is none.
+/// @param start_of_host position of the first character of the candidate.
+/// @return the candidate, or an empty StringRef when dot_pos is null, the candidate is empty,
+///         the dot sits immediately before pos, or the dot is directly followed by one of
+///         ':', '/', '?' or '#'.
+inline StringRef check_and_return_host(const Pos& pos, const Pos& dot_pos,
+                                       const Pos& start_of_host) {
+    const bool has_dot = dot_pos != nullptr;
+    if (!has_dot || pos <= start_of_host || dot_pos + 1 == pos) {
+        return StringRef {};
+    }
+
+    // Inspect the character right behind the dot: an end symbol there means the dot is dangling.
+    switch (*(dot_pos + 1)) {
+    case ':':
+    case '/':
+    case '?':
+    case '#':
+        return StringRef {};
+    default:
+        return StringRef(start_of_host, pos - start_of_host);
+    }
+}
+
+/// Extracts host from given url.
+///
+/// A leading "//" or "<scheme>://" prefix is skipped before the host is scanned; the scheme is
+/// only searched for within the first 16 bytes. Scanning stops at the first ':', '/', '?' or
+/// '#', and a '@' restarts the host right behind it.
+///
+/// @param data first byte of the url; the buffer does not have to be NUL-terminated.
+/// @param size length of the url in bytes, may be 0.
+/// @return empty StringRef if the host is not valid (i.e. it does not have a dot, or there is no
+///         symbol after the dot) or if a restricted symbol is met while scanning.
+inline StringRef get_url_host(const char* data, size_t size) {
+    // An empty url has no host and no byte that may be inspected.
+    if (size == 0) return StringRef {};
+
+    Pos pos = data;
+    Pos end = data + size;
+
+    // Only peek at the second byte when it belongs to the url: the input is described by
+    // pointer + size, so a one-byte url must not be read one position past its end.
+    if (size >= 2 && *pos == '/' && *(pos + 1) == '/') {
+        pos += 2;
+    } else {
+        // Explicit template argument: plain std::min(size, 16UL) only deduces on platforms
+        // where size_t is unsigned long.
+        Pos scheme_end = data + std::min<size_t>(size, 16);
+        for (++pos; pos < scheme_end; ++pos) {
+            if (!is_alpha_numeric_ascii(*pos)) {
+                switch (*pos) {
+                case '.':
+                case '-':
+                case '+':
+                    break;
+                case ' ': /// restricted symbols
+                case '\t':
+                case '<':
+                case '>':
+                case '%':
+                case '{':
+                case '}':
+                case '|':
+                case '\\':
+                case '^':
+                case '~':
+                case '[':
+                case ']':
+                case ';':
+                case '=':
+                case '&':
+                    return StringRef {};
+                default:
+                    /// Any other symbol ends the scheme candidate.
+                    goto exloop;
+                }
+            }
+        }
+    exloop:
+        /// Skip the scheme only when it is followed by "://" inside the scanned window;
+        /// otherwise treat the whole input as starting with the host.
+        if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/')
+            pos += 3;
+        else
+            pos = data;
+    }
+
+    Pos dot_pos = nullptr;
+    const auto* start_of_host = pos;
+    for (; pos < end; ++pos) {
+        switch (*pos) {
+        case '.':
+            dot_pos = pos;
+            break;
+        case ':': /// end symbols
+        case '/':
+        case '?':
+        case '#':
+            return check_and_return_host(pos, dot_pos, start_of_host);
+        case '@': /// myemail@gmail.com
+            start_of_host = pos + 1;
+            break;
+        case ' ': /// restricted symbols in whole URL
+        case '\t':
+        case '<':
+        case '>':
+        case '%':
+        case '{':
+        case '}':
+        case '|':
+        case '\\':
+        case '^':
+        case '~':
+        case '[':
+        case ']':
+        case ';':
+        case '=':
+        case '&':
+            return StringRef {};
+        }
+    }
+
+    return check_and_return_host(pos, dot_pos, start_of_host);
+}
+
+/// Extractor selecting the host of a url; when without_www is set, a leading "www." is dropped.
+template <bool without_www>
+struct ExtractDomain {
+    /// Per-row size hint used when reserving the result buffer.
+    static size_t get_reserve_length_for_element() { return 15; }
+
+    /// Stores the selected range in res_data / res_size; an invalid host yields a zero-length
+    /// range anchored at data.
+    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
+        const StringRef host = get_url_host(data, size);
+        if (host.size == 0) {
+            res_data = data;
+            res_size = 0;
+            return;
+        }
+
+        Pos begin = host.data;
+        size_t length = host.size;
+        if constexpr (without_www) {
+            // Strip the prefix only when something remains behind it.
+            if (length > 4 && strncmp(begin, "www.", 4) == 0) {
+                begin += 4;
+                length -= 4;
+            }
+        }
+
+        res_data = begin;
+        res_size = length;
+    }
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/function_url.cpp b/be/src/vec/functions/url/function_url.cpp
new file mode 100644
index 0000000000..501343c1f6
--- /dev/null
+++ b/be/src/vec/functions/url/function_url.cpp
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/domain.cpp
+// and modified by Doris
+
+#include "vec/functions/function_string_to_string.h"
+#include "vec/functions/simple_function_factory.h"
+#include "vec/functions/url/domain.h"
+#include "vec/functions/url/functions_url.h"
+#include "vec/functions/url/protocol.h"
+
+namespace doris::vectorized {
+
+struct NameDomain { // name tag passed as the second argument of FunctionStringToString
+ static constexpr auto name = "domain";
+};
+using FunctionDomain = // host extraction that keeps a leading "www." (ExtractDomain<false>)
+ FunctionStringToString<ExtractSubstringImpl<ExtractDomain<false>>, NameDomain>;
+
+struct NameDomainWithoutWWW { // name tag for the variant below
+ static constexpr auto name = "domain_without_www";
+};
+using FunctionDomainWithoutWWW = // host extraction that drops a leading "www." (ExtractDomain<true>)
+ FunctionStringToString<ExtractSubstringImpl<ExtractDomain<true>>, NameDomainWithoutWWW>;
+
+struct NameProtocol { // name tag for the scheme extractor below
+ static constexpr auto name = "protocol";
+};
+using FunctionProtocol = // scheme extraction via ExtractProtocol
+ FunctionStringToString<ExtractSubstringImpl<ExtractProtocol>, NameProtocol>;
+
+void register_function_url(SimpleFunctionFactory& factory) { // registers the three url functions declared in this file
+ factory.register_function<FunctionDomain>(); // "domain"
+ factory.register_function<FunctionDomainWithoutWWW>(); // "domain_without_www"
+ factory.register_function<FunctionProtocol>(); // "protocol"
+}
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/functions_url.h b/be/src/vec/functions/url/functions_url.h
new file mode 100644
index 0000000000..f9f02a17a6
--- /dev/null
+++ b/be/src/vec/functions/url/functions_url.h
@@ -0,0 +1,165 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/FunctionsURL.h
+// and modified by Doris
+
+#pragma once
+
+#include "vec/columns/column_string.h"
+#include "vec/common/memcpy_small.h"
+
+namespace doris::vectorized {
+
+/** URL processing functions. See implementation in separate .cpp files.
+ * These functions do not strictly follow the RFC; instead they are maximally simplified for performance reasons.
+ *
+ * Functions that extract parts of a URL.
+ * If the URL has no such part, an empty string is returned.
+ *
+ * domain
+ * domainWithoutWWW
+ * topLevelDomain
+ * protocol
+ * path
+ * queryString
+ * fragment
+ * queryStringAndFragment
+ * netloc
+ *
+ * Functions that remove parts from a URL.
+ * If the URL has no such part, it is returned unchanged.
+ *
+ * cutWWW
+ * cutFragment
+ * cutQueryString
+ * cutQueryStringAndFragment
+ *
+ * Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
+ * If there are many parameters with same name - return value of first one. Value is not %-decoded.
+ *
+ * extractURLParameter(URL, name)
+ *
+ * Extract all parameters from URL in form of array of strings name=value.
+ * extractURLParameters(URL)
+ *
+ * Extract names of all parameters from URL in form of array of strings.
+ * extractURLParameterNames(URL)
+ *
+ * Remove specified parameter from URL.
+ * cutURLParameter(URL, name)
+ *
+ * Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
+ * URLHierarchy(URL)
+ */
+
+using Pos = const char*;
+
+/** Copies, for every input string, the part selected by the Extractor into the result.
+ */
+template <typename Extractor>
+struct ExtractSubstringImpl {
+    /// Row-wise variant: row i of `data` ends at offsets[i]; the selected parts are appended to
+    /// res_data and their end positions are stored in res_offsets.
+    static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
+                       ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
+        const size_t row_count = offsets.size();
+        res_offsets.resize(row_count);
+        res_data.reserve(row_count * Extractor::get_reserve_length_for_element());
+
+        /// Matched part, filled in by the Extractor for each row.
+        Pos match;
+        size_t match_len;
+
+        size_t row_begin = 0;
+        size_t written = 0;
+        for (size_t row = 0; row < row_count; ++row) {
+            const size_t row_end = offsets[row];
+            Extractor::execute(reinterpret_cast<const char*>(&data[row_begin]),
+                               row_end - row_begin, match, match_len);
+
+            // Grow the result by exactly the matched length, then copy the match behind the
+            // bytes written so far.
+            res_data.resize(res_data.size() + match_len);
+            memcpy_small_allow_read_write_overflow15(&res_data[written], match, match_len);
+            written += match_len;
+
+            res_offsets[row] = written;
+            row_begin = row_end;
+        }
+    }
+
+    /// Single-string variant: res_data becomes the part of `data` selected by the Extractor.
+    static void constant(const std::string& data, std::string& res_data) {
+        Pos match;
+        size_t match_len;
+        Extractor::execute(data.data(), data.size(), match, match_len);
+        res_data.assign(match, match_len);
+    }
+
+    // static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
+    // {
+    //     throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
+    // }
+};
+
+/** Copies every input string with the part selected by the Extractor removed.
+ */
+template <typename Extractor>
+struct CutSubstringImpl {
+    /// Row-wise variant: row i of `data` ends at offsets[i]; for each row the bytes before and
+    /// after the selected part are appended to res_data, end positions go to res_offsets.
+    static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
+                       ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
+        res_data.reserve(data.size());
+        const size_t row_count = offsets.size();
+        res_offsets.resize(row_count);
+
+        /// Part to cut out, filled in by the Extractor for each row.
+        Pos cut_begin;
+        size_t cut_len;
+
+        size_t row_begin = 0;
+        size_t written = 0;
+        for (size_t row = 0; row < row_count; ++row) {
+            const size_t row_end = offsets[row];
+            const size_t row_len = row_end - row_begin;
+            const char* row_ptr = reinterpret_cast<const char*>(&data[row_begin]);
+            Extractor::execute(row_ptr, row_len, cut_begin, cut_len);
+
+            // Position of the cut relative to the whole character buffer and to this row.
+            const size_t cut_index = cut_begin - reinterpret_cast<const char*>(data.data());
+            const size_t head_len = cut_begin - row_ptr;
+            const size_t tail_len = row_end - cut_index - cut_len;
+            const size_t kept_len = row_len - cut_len;
+
+            res_data.resize(res_data.size() + kept_len);
+            // Bytes in front of the cut ...
+            memcpy_small_allow_read_write_overflow15(&res_data[written], row_ptr, head_len);
+            // ... followed by the bytes behind it.
+            memcpy_small_allow_read_write_overflow15(&res_data[written + head_len],
+                                                     cut_begin + cut_len, tail_len);
+            written += kept_len;
+
+            res_offsets[row] = written;
+            row_begin = row_end;
+        }
+    }
+
+    /// Single-string variant: appends `data` without the selected part to res_data.
+    static void constant(const std::string& data, std::string& res_data) {
+        Pos cut_begin;
+        size_t cut_len;
+        Extractor::execute(data.data(), data.size(), cut_begin, cut_len);
+
+        Pos src_begin = data.data();
+        Pos src_end = src_begin + data.size();
+        res_data.reserve(data.size() - cut_len);
+        res_data.append(src_begin, cut_begin);
+        res_data.append(cut_begin + cut_len, src_end);
+    }
+
+    // static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
+    // {
+    //     throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
+    // }
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/protocol.h b/be/src/vec/functions/url/protocol.h
new file mode 100644
index 0000000000..b80a58c6fe
--- /dev/null
+++ b/be/src/vec/functions/url/protocol.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/protocol.h
+// and modified by Doris
+
+#pragma once
+
+#include "vec/common/string_utils/string_utils.h"
+#include "vec/functions/url/functions_url.h"
+
+namespace doris::vectorized {
+
+/// Extracts scheme from given url.
+///
+/// @param data first byte of the url; the buffer does not have to be NUL-terminated.
+/// @param size length of the url in bytes, may be 0.
+/// @return the longest prefix of the url matching the grammar below, or an empty StringRef when
+///         the url is empty or its first byte does not pass is_alpha_ascii. What follows the
+///         scheme (e.g. ':') is not checked here.
+inline StringRef get_url_scheme(const char* data, size_t size) {
+    // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+
+    // Test the length first: an empty url has no first byte that may be dereferenced.
+    if (size == 0 || !is_alpha_ascii(*data)) {
+        return {};
+    }
+
+    const char* pos = data + 1;
+    const char* end = data + size;
+    for (; pos < end; ++pos) {
+        if (!(is_alpha_numeric_ascii(*pos) || *pos == '+' || *pos == '-' || *pos == '.')) {
+            break;
+        }
+    }
+
+    return StringRef(data, pos - data);
+}
+
+/// Extractor selecting the scheme of a url, without the ':' that follows it.
+struct ExtractProtocol {
+    /// Per-row size hint: the length of "https" plus one byte.
+    static size_t get_reserve_length_for_element() { return sizeof("https"); }
+
+    /// Stores the selected range in res_data / res_size. The range is empty unless a scheme
+    /// was found, at least four bytes follow it, and the first of them is ':'.
+    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
+        res_data = data;
+        res_size = 0;
+
+        const StringRef scheme = get_url_scheme(data, size);
+        if (scheme.size == 0) return;
+
+        const Pos url_end = data + size;
+        const Pos scheme_end = data + scheme.size;
+        if (url_end - scheme_end < 4) return;
+
+        if (*scheme_end == ':') {
+            res_size = scheme_end - data;
+        }
+    }
+};
+
+} // namespace doris::vectorized
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index be8d7a17bd..4c6c951b88 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -361,6 +361,7 @@ set(VEC_TEST_FILES
vec/function/function_jsonb_test.cpp
vec/function/function_geo_test.cpp
vec/function/function_test_util.cpp
+ vec/function/function_url_test.cpp
vec/function/table_function_test.cpp
vec/runtime/vdata_stream_test.cpp
vec/runtime/vdatetime_value_test.cpp
diff --git a/be/test/vec/function/function_url_test.cpp b/be/test/vec/function/function_url_test.cpp
new file mode 100644
index 0000000000..b97cb36389
--- /dev/null
+++ b/be/test/vec/function/function_url_test.cpp
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "function_test_util.h"
+#include "vec/data_types/data_type_jsonb.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+
+namespace doris::vectorized {
+using namespace ut_type;
+
+TEST(FunctionUrlTEST, DomainTest) { // expected results of "domain" for representative urls
+ std::string func_name = "domain";
+ InputTypeSet input_types = {TypeIndex::String};
+
+ DataSet data_set = {
+ {{Null()}, Null()}, // null in, null out
+ {{STRING("http://paul@www.example.com:80/")}, STRING("www.example.com")},
+ {{STRING("http:/paul/example/com")}, STRING("")}, // single slash after the scheme
+ {{STRING("http://www.example.com?q=4")}, STRING("www.example.com")},
+ {{STRING("http://127.0.0.1:443/")}, STRING("127.0.0.1")},
+ {{STRING("//www.example.com")}, STRING("www.example.com")}, // scheme-less "//" prefix
+ {{STRING("//paul@www.example.com")}, STRING("www.example.com")},
+ {{STRING("www.example.com")}, STRING("www.example.com")}, // bare host
+ {{STRING("example.com")}, STRING("example.com")},
+ };
+
+ check_function<DataTypeString, true>(func_name, input_types, data_set);
+}
+
+TEST(FunctionUrlTEST, DomainWithoutWWWTest) { // same inputs as DomainTest; a leading "www." is dropped
+ std::string func_name = "domain_without_www";
+ InputTypeSet input_types = {TypeIndex::String};
+
+ DataSet data_set = {
+ {{Null()}, Null()}, // null in, null out
+ {{STRING("http://paul@www.example.com:80/")}, STRING("example.com")},
+ {{STRING("http:/paul/example/com")}, STRING("")}, // single slash after the scheme
+ {{STRING("http://www.example.com?q=4")}, STRING("example.com")},
+ {{STRING("http://127.0.0.1:443/")}, STRING("127.0.0.1")}, // no "www." prefix to strip
+ {{STRING("//www.example.com")}, STRING("example.com")},
+ {{STRING("//paul@www.example.com")}, STRING("example.com")},
+ {{STRING("www.example.com")}, STRING("example.com")},
+ {{STRING("example.com")}, STRING("example.com")},
+ };
+
+ check_function<DataTypeString, true>(func_name, input_types, data_set);
+}
+
+TEST(FunctionUrlTEST, ProtocolTest) { // expected results of "protocol" for representative urls
+ std::string func_name = "protocol";
+ InputTypeSet input_types = {TypeIndex::String};
+
+ DataSet data_set = {
+ {{Null()}, Null()}, // null in, null out
+ {{STRING("http://paul@www.example.com:80/")}, STRING("http")},
+ {{STRING("http:/paul/example/com")}, STRING("http")}, // "//" is not required after ':'
+ {{STRING("http://www.example.com?q=4")}, STRING("http")},
+ {{STRING("http://127.0.0.1:443/")}, STRING("http")},
+ {{STRING("//www.example.com")}, STRING("")}, // no scheme in front of "//"
+ {{STRING("//paul@www.example.com")}, STRING("")},
+ {{STRING("www.example.com")}, STRING("")}, // no ':' behind the leading token
+ {{STRING("example.com")}, STRING("")},
+ {{STRING("https://example.com/")}, STRING("https")},
+ {{STRING("svn+ssh://example.com?q=hello%20world")}, STRING("svn+ssh")}, // '+' inside a scheme
+ {{STRING("ftp://example.com/")}, STRING("ftp")},
+ {{STRING("ftp!://example.com/")}, STRING("")}, // '!' ends the scheme before ':'
+ {{STRING("http://127.0.0.1:443/")}, STRING("http")}, // NOTE(review): repeats the row above
+ {{STRING("https!://example.com/")}, STRING("")},
+ {{STRING("http!://example.com/")}, STRING("")},
+ };
+
+ check_function<DataTypeString, true>(func_name, input_types, data_set);
+}
+
+} // namespace doris::vectorized
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index caa609da3b..0c3a238761 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2772,6 +2772,17 @@ visible_functions = [
[['grouping'], 'BIGINT', ['BIGINT'],
'_ZN5doris21GroupingSetsFunctions8groupingEPN9doris_udf15FunctionContextERKNS1_9BigIntValE',
'' ,'', 'vec', 'ALWAYS_NOT_NULLABLE'],
+
+ # url functions
+ [['domain'], 'STRING', ['STRING'],
+ 'fake_symble_for_no_vec', '', '',
+ 'vec', ''],
+ [['domain_without_www'], 'STRING', ['STRING'],
+ 'fake_symble_for_no_vec', '', '',
+ 'vec', ''],
+ [['protocol'], 'STRING', ['STRING'],
+ 'fake_symble_for_no_vec', '', '',
+ 'vec', ''],
]
# Except the following functions, other function will directly return
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org