You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by ga...@apache.org on 2022/12/15 01:54:42 UTC

[doris] branch master updated: [improvment](function) add new function substring_index (#15024)

This is an automated email from the ASF dual-hosted git repository.

gabriellee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 21c2e485ae [improvment](function) add new function substring_index (#15024)
21c2e485ae is described below

commit 21c2e485ae0969585f968c8a7ca324aeb4857151
Author: Yulei-Yang <yu...@gmail.com>
AuthorDate: Thu Dec 15 09:54:34 2022 +0800

    [improvment](function) add new function substring_index (#15024)
---
 be/src/vec/functions/function_string.cpp           |   1 +
 be/src/vec/functions/function_string.h             | 173 +++++++++++++++++++++
 .../string-functions/substring_index.md            |  92 +++++++++++
 docs/sidebars.json                                 |   1 +
 .../string-functions/substring_index.md            |  91 +++++++++++
 gensrc/script/doris_builtins_functions.py          |   6 +
 .../string_functions/test_string_function.out      |  40 ++++-
 .../string_functions/test_string_function.groovy   |  14 ++
 8 files changed, 417 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp
index 2a6a8af38c..ad8499dcc7 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -685,6 +685,7 @@ void register_function_string(SimpleFunctionFactory& factory) {
     factory.register_function<FunctionSplitPart>();
     factory.register_function<FunctionSplitByString>();
     factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>();
+    factory.register_function<FunctionSubstringIndex>();
     factory.register_function<FunctionExtractURLParameter>();
     factory.register_function<FunctionStringParseUrl>();
     factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>();
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 5cd0297f02..bfe3a2f1dc 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1407,6 +1407,179 @@ public:
     }
 };
 
+class FunctionSubstringIndex : public IFunction {
+public:
+    static constexpr auto name = "substring_index";
+    static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 3; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return make_nullable(std::make_shared<DataTypeString>());
+    }
+
+    bool use_default_implementation_for_nulls() const override { return true; }
+    bool use_default_implementation_for_constants() const override { return false; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 3);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        // Create a zero column to simply implement
+        auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto res = ColumnString::create();
+
+        auto& res_offsets = res->get_offsets();
+        auto& res_chars = res->get_chars();
+        res_offsets.resize(input_rows_count);
+
+        ColumnPtr content_column =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+
+        if (auto* nullable = check_and_get_column<const ColumnNullable>(*content_column)) {
+            // Danger: Here must dispose the null map data first! Because
+            // argument_columns[0]=nullable->get_nested_column_ptr(); will release the mem
+            // of column nullable mem of null map
+            VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data());
+            content_column = nullable->get_nested_column_ptr();
+        }
+
+        for (size_t i = 1; i <= 2; i++) {
+            ColumnPtr columnPtr = remove_nullable(block.get_by_position(arguments[i]).column);
+
+            if (!is_column_const(*columnPtr)) {
+                return Status::RuntimeError("Argument at index {} for function {} must be constant",
+                                            i + 1, get_name());
+            }
+        }
+
+        auto str_col = assert_cast<const ColumnString*>(content_column.get());
+
+        const IColumn& delimiter_col = *block.get_by_position(arguments[1]).column;
+        const auto* delimiter_const = typeid_cast<const ColumnConst*>(&delimiter_col);
+        auto delimiter = delimiter_const->get_field().get<String>();
+        int32_t delimiter_size = delimiter.size();
+
+        const IColumn& part_num_col = *block.get_by_position(arguments[2]).column;
+        const auto* part_num_col_const = typeid_cast<const ColumnConst*>(&part_num_col);
+        auto part_number = part_num_col_const->get_field().get<Int32>();
+
+        if (part_number == 0 || delimiter_size == 0) {
+            for (size_t i = 0; i < input_rows_count; ++i) {
+                StringOP::push_empty_string(i, res_chars, res_offsets);
+            }
+        } else if (part_number > 0) {
+            if (delimiter_size == 1) {
+                // If delimiter is a char, use memchr to split
+                for (size_t i = 0; i < input_rows_count; ++i) {
+                    auto str = str_col->get_data_at(i);
+                    int32_t offset = -1;
+                    int32_t num = 0;
+                    while (num < part_number) {
+                        size_t n = str.size - offset - 1;
+                        const char* pos = reinterpret_cast<const char*>(
+                                memchr(str.data + offset + 1, delimiter[0], n));
+                        if (pos != nullptr) {
+                            offset = pos - str.data;
+                            num++;
+                        } else {
+                            offset = str.size;
+                            num = (num == 0) ? 0 : num + 1;
+                            break;
+                        }
+                    }
+
+                    if (num == part_number) {
+                        StringOP::push_value_string(
+                                std::string_view {reinterpret_cast<const char*>(str.data),
+                                                  (size_t)offset},
+                                i, res_chars, res_offsets);
+                    } else {
+                        StringOP::push_value_string(std::string_view(str.data, str.size), i,
+                                                    res_chars, res_offsets);
+                    }
+                }
+            } else {
+                // If delimiter is a string, use memmem to split
+                for (size_t i = 0; i < input_rows_count; ++i) {
+                    auto str = str_col->get_data_at(i);
+                    int32_t offset = -delimiter_size;
+                    int32_t num = 0;
+                    while (num < part_number) {
+                        size_t n = str.size - offset - delimiter_size;
+                        char* pos = reinterpret_cast<char*>(
+                                memmem(str.data + offset + delimiter_size, n, delimiter.c_str(),
+                                       delimiter_size));
+                        if (pos != nullptr) {
+                            offset = pos - str.data;
+                            num++;
+                        } else {
+                            offset = str.size;
+                            num = (num == 0) ? 0 : num + 1;
+                            break;
+                        }
+                    }
+
+                    if (num == part_number) {
+                        StringOP::push_value_string(
+                                std::string_view {reinterpret_cast<const char*>(str.data),
+                                                  (size_t)offset},
+                                i, res_chars, res_offsets);
+                    } else {
+                        StringOP::push_value_string(std::string_view(str.data, str.size), i,
+                                                    res_chars, res_offsets);
+                    }
+                }
+            }
+        } else {
+            // if part_number is negative
+            part_number = -part_number;
+            for (size_t i = 0; i < input_rows_count; ++i) {
+                auto str = str_col->get_data_at(i);
+                auto str_str = str.to_string();
+                int32_t offset = str.size;
+                int32_t pre_offset = offset;
+                int32_t num = 0;
+                auto substr = str_str;
+                while (num <= part_number && offset >= 0) {
+                    offset = (int)substr.rfind(delimiter, offset);
+                    if (offset != -1) {
+                        if (++num == part_number) {
+                            break;
+                        }
+                        pre_offset = offset;
+                        offset = offset - 1;
+                        substr = str_str.substr(0, pre_offset);
+                    } else {
+                        break;
+                    }
+                }
+                num = (offset == -1 && num != 0) ? num + 1 : num;
+
+                if (num == part_number) {
+                    if (offset == -1) {
+                        StringOP::push_value_string(std::string_view(str.data, str.size), i,
+                                                    res_chars, res_offsets);
+                    } else {
+                        StringOP::push_value_string(
+                                std::string_view {str.data + offset + delimiter_size,
+                                                  str.size - offset - delimiter_size},
+                                i, res_chars, res_offsets);
+                    }
+                } else {
+                    StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars,
+                                                res_offsets);
+                }
+            }
+        }
+
+        block.get_by_position(result).column =
+                ColumnNullable::create(std::move(res), std::move(null_map));
+        return Status::OK();
+    }
+};
+
 class FunctionSplitByString : public IFunction {
 public:
     static constexpr auto name = "split_by_string";
diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md
new file mode 100644
index 0000000000..129e6afcd0
--- /dev/null
+++ b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md
@@ -0,0 +1,92 @@
+---
+{
+"title": "substring_index",
+"language": "en"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## substring_index
+
+### Name
+
+<version since="1.2">
+
+SUBSTRING_INDEX
+
+</version>
+
+### description
+
+#### Syntax
+
+`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
+
+Split `content` to two parts at position where the `field`s of `delimiter` stays, return one of them according to below rules:
+if `field` is positive, return the left part;
+else if `field` is negative, return the right part;
+if `field` is zero, return an empty string when `content` is not null, else will return null.
+
+- `delimiter` is case sensitive and multi-byte safe.
+- `delimiter` and `field` parameter should be constant.
+
+
+### example
+
+```
+mysql> select substring_index("hello world", " ", 1);
++----------------------------------------+
+| substring_index("hello world", " ", 1) |
++----------------------------------------+
+| hello                                  |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", 2);
++----------------------------------------+
+| substring_index("hello world", " ", 2) |
++----------------------------------------+
+| hello world                            |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", -1);
++-----------------------------------------+
+| substring_index("hello world", " ", -1) |
++-----------------------------------------+
+| world                                   |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -2);
++-----------------------------------------+
+| substring_index("hello world", " ", -2) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -3);
++-----------------------------------------+
+| substring_index("hello world", " ", -3) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", 0);
++----------------------------------------+
+| substring_index("hello world", " ", 0) |
++----------------------------------------+
+|                                        |
++----------------------------------------+
+```
+### keywords
+
+    SUBSTRING_INDEX, SUBSTRING
\ No newline at end of file
diff --git a/docs/sidebars.json b/docs/sidebars.json
index a565ef094c..ee3d0d8b0c 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -410,6 +410,7 @@
                                 "sql-manual/sql-functions/string-functions/strleft",
                                 "sql-manual/sql-functions/string-functions/strright",
                                 "sql-manual/sql-functions/string-functions/split_part",
+                                "sql-manual/sql-functions/string-functions/substring_index",
                                 "sql-manual/sql-functions/string-functions/money_format",
                                 "sql-manual/sql-functions/string-functions/parse_url",
                                 "sql-manual/sql-functions/string-functions/convert_to",
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md
new file mode 100644
index 0000000000..f328c08003
--- /dev/null
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md
@@ -0,0 +1,91 @@
+---
+{
+"title": "substring_index",
+"language": "zh-CN"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## substring_index
+
+### Name
+
+<version since="1.2">
+
+SUBSTRING_INDEX
+
+</version>
+
+### description
+
+#### Syntax
+
+`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
+
+返回 content 的子字符串,在 delimiter 出现 field 次的位置按如下规则截取:  
+如果 field > 0,则从左边算起,返回截取位置前的子串;  
+如果 field < 0,则从右边算起,返回截取位置后的子串;
+如果 field = 0,返回一个空串(`content` 不为null), 或者Null (`content` = null)。
+
+- delimiter 大小写敏感,且是多字节安全的。
+- `delimiter` 和 `field` 参数需要是常量, 不支持变量。
+
+### example
+
+```
+mysql> select substring_index("hello world", " ", 1);
++----------------------------------------+
+| substring_index("hello world", " ", 1) |
++----------------------------------------+
+| hello                                  |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", 2);
++----------------------------------------+
+| substring_index("hello world", " ", 2) |
++----------------------------------------+
+| hello world                            |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", -1);
++-----------------------------------------+
+| substring_index("hello world", " ", -1) |
++-----------------------------------------+
+| world                                   |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -2);
++-----------------------------------------+
+| substring_index("hello world", " ", -2) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -3);
++-----------------------------------------+
+| substring_index("hello world", " ", -3) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", 0);
++----------------------------------------+
+| substring_index("hello world", " ", 0) |
++----------------------------------------+
+|                                        |
++----------------------------------------+
+```
+### keywords
+
+    SUBSTRING_INDEX, SUBSTRING
\ No newline at end of file
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index a76653b7fe..ccf4cbfbe3 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2382,6 +2382,9 @@ visible_functions = [
     [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
         '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
         '', '', 'vec', 'ALWAYS_NULLABLE'],
+    [['substring_index'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
+        '_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
+        '', '', 'vec', 'ALWAYS_NULLABLE'],
     [['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''],
 
     [['sub_replace'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],'','', '', 'vec', 'ALWAYS_NULLABLE'],
@@ -2539,6 +2542,9 @@ visible_functions = [
     [['split_part'], 'STRING', ['STRING', 'STRING', 'INT'],
         '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
         '', '', 'vec', 'ALWAYS_NULLABLE'],
+    [['substring_index'], 'STRING', ['STRING', 'STRING', 'INT'],
+        '_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
+        '', '', 'vec', 'ALWAYS_NULLABLE'],
 
     # Utility functions
     [['convert_to'], 'VARCHAR', ['VARCHAR','VARCHAR'], '','', '', 'vec', ''],
diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
index 883c242f1a..cf6ad2c94c 100644
--- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
+++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
@@ -311,6 +311,45 @@ tNEW-STRorigin str
 -- !sql --
 d***is
 
+-- !sql --
+hello
+
+-- !sql --
+hello world
+
+-- !sql --
+hello world
+
+-- !sql --
+world
+
+-- !sql --
+hello world
+
+-- !sql --
+hello world
+
+-- !sql --
+prefix__string2
+
+-- !sql --
+prefix_
+
+-- !sql --
+prefix_string2
+
+-- !sql --
+\N
+
+-- !sql --
+\N
+
+-- !sql --
+\N
+
+-- !sql --
+prefix_string
+
 -- !sql --
 \N
 
@@ -328,4 +367,3 @@ tNEW-STRorigin str
 
 -- !sql --
 d***is
-
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
index aed5b73a07..d4882544e0 100644
--- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
+++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
@@ -154,6 +154,20 @@ suite("test_string_function") {
     qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);"
     qt_sql "select sub_replace(\"doris\",\"***\",1,2);"
 
+    qt_sql "select substring_index(\"hello world\", \" \", 1);"
+    qt_sql "select substring_index(\"hello world\", \" \", 2);"
+    qt_sql "select substring_index(\"hello world\", \" \", 3);"
+    qt_sql "select substring_index(\"hello world\", \" \", -1);"
+    qt_sql "select substring_index(\"hello world\", \" \", -2);"
+    qt_sql "select substring_index(\"hello world\", \" \", -3);"
+    qt_sql "select substring_index(\"prefix__string2\", \"__\", 2);"
+    qt_sql "select substring_index(\"prefix__string2\", \"_\", 2);"
+    qt_sql "select substring_index(\"prefix_string2\", \"__\", 1);"
+    qt_sql "select substring_index(null, \"__\", 1);"
+    qt_sql "select substring_index(\"prefix_string\", null, 1);"
+    qt_sql "select substring_index(\"prefix_string\", \"_\", null);"
+    qt_sql "select substring_index(\"prefix_string\", \"__\", -1);"
+
     sql 'set enable_nereids_planner=true'
     sql 'set enable_fallback_to_original_planner=false'
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org