You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ga...@apache.org on 2022/12/15 01:54:42 UTC
[doris] branch master updated: [improvment](function) add new function substring_index (#15024)
This is an automated email from the ASF dual-hosted git repository.
gabriellee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 21c2e485ae [improvment](function) add new function substring_index (#15024)
21c2e485ae is described below
commit 21c2e485ae0969585f968c8a7ca324aeb4857151
Author: Yulei-Yang <yu...@gmail.com>
AuthorDate: Thu Dec 15 09:54:34 2022 +0800
[improvment](function) add new function substring_index (#15024)
---
be/src/vec/functions/function_string.cpp | 1 +
be/src/vec/functions/function_string.h | 173 +++++++++++++++++++++
.../string-functions/substring_index.md | 92 +++++++++++
docs/sidebars.json | 1 +
.../string-functions/substring_index.md | 91 +++++++++++
gensrc/script/doris_builtins_functions.py | 6 +
.../string_functions/test_string_function.out | 40 ++++-
.../string_functions/test_string_function.groovy | 14 ++
8 files changed, 417 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index 2a6a8af38c..ad8499dcc7 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -685,6 +685,7 @@ void register_function_string(SimpleFunctionFactory& factory) {
factory.register_function<FunctionSplitPart>();
factory.register_function<FunctionSplitByString>();
factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>();
+ factory.register_function<FunctionSubstringIndex>();
factory.register_function<FunctionExtractURLParameter>();
factory.register_function<FunctionStringParseUrl>();
factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>();
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 5cd0297f02..bfe3a2f1dc 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1407,6 +1407,179 @@ public:
}
};
+class FunctionSubstringIndex : public IFunction {
+public:
+ static constexpr auto name = "substring_index";
+ static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); }
+ String get_name() const override { return name; }
+ size_t get_number_of_arguments() const override { return 3; }
+
+ DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+ return make_nullable(std::make_shared<DataTypeString>());
+ }
+
+ bool use_default_implementation_for_nulls() const override { return true; }
+ bool use_default_implementation_for_constants() const override { return false; }
+
+ Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+ size_t result, size_t input_rows_count) override {
+ DCHECK_EQ(arguments.size(), 3);
+
+ auto null_map = ColumnUInt8::create(input_rows_count, 0);
+ // Create a zero column to simply implement
+ auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+ auto res = ColumnString::create();
+
+ auto& res_offsets = res->get_offsets();
+ auto& res_chars = res->get_chars();
+ res_offsets.resize(input_rows_count);
+
+ ColumnPtr content_column =
+ block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+
+ if (auto* nullable = check_and_get_column<const ColumnNullable>(*content_column)) {
+ // Danger: Here must dispose the null map data first! Because
+ // argument_columns[0]=nullable->get_nested_column_ptr(); will release the mem
+ // of column nullable mem of null map
+ VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data());
+ content_column = nullable->get_nested_column_ptr();
+ }
+
+ for (size_t i = 1; i <= 2; i++) {
+ ColumnPtr columnPtr = remove_nullable(block.get_by_position(arguments[i]).column);
+
+ if (!is_column_const(*columnPtr)) {
+ return Status::RuntimeError("Argument at index {} for function {} must be constant",
+ i + 1, get_name());
+ }
+ }
+
+ auto str_col = assert_cast<const ColumnString*>(content_column.get());
+
+ const IColumn& delimiter_col = *block.get_by_position(arguments[1]).column;
+ const auto* delimiter_const = typeid_cast<const ColumnConst*>(&delimiter_col);
+ auto delimiter = delimiter_const->get_field().get<String>();
+ int32_t delimiter_size = delimiter.size();
+
+ const IColumn& part_num_col = *block.get_by_position(arguments[2]).column;
+ const auto* part_num_col_const = typeid_cast<const ColumnConst*>(&part_num_col);
+ auto part_number = part_num_col_const->get_field().get<Int32>();
+
+ if (part_number == 0 || delimiter_size == 0) {
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ StringOP::push_empty_string(i, res_chars, res_offsets);
+ }
+ } else if (part_number > 0) {
+ if (delimiter_size == 1) {
+ // If delimiter is a char, use memchr to split
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ auto str = str_col->get_data_at(i);
+ int32_t offset = -1;
+ int32_t num = 0;
+ while (num < part_number) {
+ size_t n = str.size - offset - 1;
+ const char* pos = reinterpret_cast<const char*>(
+ memchr(str.data + offset + 1, delimiter[0], n));
+ if (pos != nullptr) {
+ offset = pos - str.data;
+ num++;
+ } else {
+ offset = str.size;
+ num = (num == 0) ? 0 : num + 1;
+ break;
+ }
+ }
+
+ if (num == part_number) {
+ StringOP::push_value_string(
+ std::string_view {reinterpret_cast<const char*>(str.data),
+ (size_t)offset},
+ i, res_chars, res_offsets);
+ } else {
+ StringOP::push_value_string(std::string_view(str.data, str.size), i,
+ res_chars, res_offsets);
+ }
+ }
+ } else {
+ // If delimiter is a string, use memmem to split
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ auto str = str_col->get_data_at(i);
+ int32_t offset = -delimiter_size;
+ int32_t num = 0;
+ while (num < part_number) {
+ size_t n = str.size - offset - delimiter_size;
+ char* pos = reinterpret_cast<char*>(
+ memmem(str.data + offset + delimiter_size, n, delimiter.c_str(),
+ delimiter_size));
+ if (pos != nullptr) {
+ offset = pos - str.data;
+ num++;
+ } else {
+ offset = str.size;
+ num = (num == 0) ? 0 : num + 1;
+ break;
+ }
+ }
+
+ if (num == part_number) {
+ StringOP::push_value_string(
+ std::string_view {reinterpret_cast<const char*>(str.data),
+ (size_t)offset},
+ i, res_chars, res_offsets);
+ } else {
+ StringOP::push_value_string(std::string_view(str.data, str.size), i,
+ res_chars, res_offsets);
+ }
+ }
+ }
+ } else {
+ // if part_number is negative
+ part_number = -part_number;
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ auto str = str_col->get_data_at(i);
+ auto str_str = str.to_string();
+ int32_t offset = str.size;
+ int32_t pre_offset = offset;
+ int32_t num = 0;
+ auto substr = str_str;
+ while (num <= part_number && offset >= 0) {
+ offset = (int)substr.rfind(delimiter, offset);
+ if (offset != -1) {
+ if (++num == part_number) {
+ break;
+ }
+ pre_offset = offset;
+ offset = offset - 1;
+ substr = str_str.substr(0, pre_offset);
+ } else {
+ break;
+ }
+ }
+ num = (offset == -1 && num != 0) ? num + 1 : num;
+
+ if (num == part_number) {
+ if (offset == -1) {
+ StringOP::push_value_string(std::string_view(str.data, str.size), i,
+ res_chars, res_offsets);
+ } else {
+ StringOP::push_value_string(
+ std::string_view {str.data + offset + delimiter_size,
+ str.size - offset - delimiter_size},
+ i, res_chars, res_offsets);
+ }
+ } else {
+ StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars,
+ res_offsets);
+ }
+ }
+ }
+
+ block.get_by_position(result).column =
+ ColumnNullable::create(std::move(res), std::move(null_map));
+ return Status::OK();
+ }
+};
+
class FunctionSplitByString : public IFunction {
public:
static constexpr auto name = "split_by_string";
diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md
new file mode 100644
index 0000000000..129e6afcd0
--- /dev/null
+++ b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md
@@ -0,0 +1,92 @@
+---
+{
+"title": "substring_index",
+"language": "en"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## substring_index
+
+### Name
+
+<version since="1.2">
+
+SUBSTRING_INDEX
+
+</version>
+
+### description
+
+#### Syntax
+
+`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
+
+Split `content` to two parts at position where the `field`s of `delimiter` stays, return one of them according to below rules:
+if `field` is positive, return the left part;
+else if `field` is negative, return the right part;
+if `field` is zero, return an empty string when `content` is not null, else will return null.
+
+- `delimiter` is case sensitive and multi-byte safe.
+- `delimiter` and `field` parameter should be constant.
+
+
+### example
+
+```
+mysql> select substring_index("hello world", " ", 1);
++----------------------------------------+
+| substring_index("hello world", " ", 1) |
++----------------------------------------+
+| hello |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", 2);
++----------------------------------------+
+| substring_index("hello world", " ", 2) |
++----------------------------------------+
+| hello world |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", -1);
++-----------------------------------------+
+| substring_index("hello world", " ", -1) |
++-----------------------------------------+
+| world |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -2);
++-----------------------------------------+
+| substring_index("hello world", " ", -2) |
++-----------------------------------------+
+| hello world |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -3);
++-----------------------------------------+
+| substring_index("hello world", " ", -3) |
++-----------------------------------------+
+| hello world |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", 0);
++----------------------------------------+
+| substring_index("hello world", " ", 0) |
++----------------------------------------+
+| |
++----------------------------------------+
+```
+### keywords
+
+ SUBSTRING_INDEX, SUBSTRING
\ No newline at end of file
diff --git a/docs/sidebars.json b/docs/sidebars.json
index a565ef094c..ee3d0d8b0c 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -410,6 +410,7 @@
"sql-manual/sql-functions/string-functions/strleft",
"sql-manual/sql-functions/string-functions/strright",
"sql-manual/sql-functions/string-functions/split_part",
+ "sql-manual/sql-functions/string-functions/substring_index",
"sql-manual/sql-functions/string-functions/money_format",
"sql-manual/sql-functions/string-functions/parse_url",
"sql-manual/sql-functions/string-functions/convert_to",
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md
new file mode 100644
index 0000000000..f328c08003
--- /dev/null
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md
@@ -0,0 +1,91 @@
+---
+{
+"title": "substring_index",
+"language": "zh-CN"
+}
+---
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## substring_index
+
+### Name
+
+<version since="1.2">
+
+SUBSTRING_INDEX
+
+</version>
+
+### description
+
+#### Syntax
+
+`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
+
+返回 content 的子字符串,在 delimiter 出现 field 次的位置按如下规则截取:
+如果 field > 0,则从左边算起,返回截取位置前的子串;
+如果 field < 0,则从右边算起,返回截取位置后的子串;
+如果 field = 0,返回一个空串(`content` 不为null), 或者Null (`content` = null)。
+
+- delimiter 大小写敏感,且是多字节安全的。
+- `delimiter` 和 `field` 参数需要是常量, 不支持变量。
+
+### example
+
+```
+mysql> select substring_index("hello world", " ", 1);
++----------------------------------------+
+| substring_index("hello world", " ", 1) |
++----------------------------------------+
+| hello |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", 2);
++----------------------------------------+
+| substring_index("hello world", " ", 2) |
++----------------------------------------+
+| hello world |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", -1);
++-----------------------------------------+
+| substring_index("hello world", " ", -1) |
++-----------------------------------------+
+| world |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -2);
++-----------------------------------------+
+| substring_index("hello world", " ", -2) |
++-----------------------------------------+
+| hello world |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -3);
++-----------------------------------------+
+| substring_index("hello world", " ", -3) |
++-----------------------------------------+
+| hello world |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", 0);
++----------------------------------------+
+| substring_index("hello world", " ", 0) |
++----------------------------------------+
+| |
++----------------------------------------+
+```
+### keywords
+
+ SUBSTRING_INDEX, SUBSTRING
\ No newline at end of file
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index a76653b7fe..ccf4cbfbe3 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2382,6 +2382,9 @@ visible_functions = [
[['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
'', '', 'vec', 'ALWAYS_NULLABLE'],
+ [['substring_index'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
+ '_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
+ '', '', 'vec', 'ALWAYS_NULLABLE'],
[['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''],
[['sub_replace'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],'','', '', 'vec', 'ALWAYS_NULLABLE'],
@@ -2539,6 +2542,9 @@ visible_functions = [
[['split_part'], 'STRING', ['STRING', 'STRING', 'INT'],
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
'', '', 'vec', 'ALWAYS_NULLABLE'],
+ [['substring_index'], 'STRING', ['STRING', 'STRING', 'INT'],
+ '_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
+ '', '', 'vec', 'ALWAYS_NULLABLE'],
# Utility functions
[['convert_to'], 'VARCHAR', ['VARCHAR','VARCHAR'], '','', '', 'vec', ''],
diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
index 883c242f1a..cf6ad2c94c 100644
--- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
+++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
@@ -311,6 +311,45 @@ tNEW-STRorigin str
-- !sql --
d***is
+-- !sql --
+hello
+
+-- !sql --
+hello world
+
+-- !sql --
+hello world
+
+-- !sql --
+world
+
+-- !sql --
+hello world
+
+-- !sql --
+hello world
+
+-- !sql --
+prefix__string2
+
+-- !sql --
+prefix_
+
+-- !sql --
+prefix_string2
+
+-- !sql --
+\N
+
+-- !sql --
+\N
+
+-- !sql --
+\N
+
+-- !sql --
+prefix_string
+
-- !sql --
\N
@@ -328,4 +367,3 @@ tNEW-STRorigin str
-- !sql --
d***is
-
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
index aed5b73a07..d4882544e0 100644
--- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
+++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
@@ -154,6 +154,20 @@ suite("test_string_function") {
qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);"
qt_sql "select sub_replace(\"doris\",\"***\",1,2);"
+ qt_sql "select substring_index(\"hello world\", \" \", 1);"
+ qt_sql "select substring_index(\"hello world\", \" \", 2);"
+ qt_sql "select substring_index(\"hello world\", \" \", 3);"
+ qt_sql "select substring_index(\"hello world\", \" \", -1);"
+ qt_sql "select substring_index(\"hello world\", \" \", -2);"
+ qt_sql "select substring_index(\"hello world\", \" \", -3);"
+ qt_sql "select substring_index(\"prefix__string2\", \"__\", 2);"
+ qt_sql "select substring_index(\"prefix__string2\", \"_\", 2);"
+ qt_sql "select substring_index(\"prefix_string2\", \"__\", 1);"
+ qt_sql "select substring_index(null, \"__\", 1);"
+ qt_sql "select substring_index(\"prefix_string\", null, 1);"
+ qt_sql "select substring_index(\"prefix_string\", \"_\", null);"
+ qt_sql "select substring_index(\"prefix_string\", \"__\", -1);"
+
sql 'set enable_nereids_planner=true'
sql 'set enable_fallback_to_original_planner=false'
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org