You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by xu...@apache.org on 2022/07/17 09:50:44 UTC

[doris] branch master updated: [feature-wip] (array-type) function concat_ws support array (#10749)

This is an automated email from the ASF dual-hosted git repository.

xuyang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 5793cb11d0 [feature-wip] (array-type) function concat_ws support array (#10749)
5793cb11d0 is described below

commit 5793cb11d07e8d46fd4675820a661d36b623d9ac
Author: zxealous <xe...@gmail.com>
AuthorDate: Sun Jul 17 17:50:39 2022 +0800

    [feature-wip] (array-type) function concat_ws support array (#10749)
    
    Issue #10052
    function concat_ws support array
---
 be/src/vec/functions/function_string.h             | 147 ++++++++++++++++++---
 be/test/vec/function/function_string_test.cpp      |  17 +++
 .../sql-functions/string-functions/concat_ws.md    |  30 ++++-
 .../sql-functions/string-functions/concat_ws.md    |  28 +++-
 gensrc/script/doris_builtins_functions.py          |   3 +
 .../string_functions/test_string_function.out      |  12 ++
 .../string_functions/test_string_function.groovy   |   4 +
 7 files changed, 215 insertions(+), 26 deletions(-)

diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index c545223a23..a7df35b1ed 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -30,12 +30,14 @@
 #include "util/md5.h"
 #include "util/sm3.h"
 #include "util/url_parser.h"
+#include "vec/columns/column_array.h"
 #include "vec/columns/column_decimal.h"
 #include "vec/columns/column_nullable.h"
 #include "vec/columns/column_string.h"
 #include "vec/columns/columns_number.h"
 #include "vec/common/assert_cast.h"
 #include "vec/common/string_ref.h"
+#include "vec/data_types/data_type_array.h"
 #include "vec/data_types/data_type_decimal.h"
 #include "vec/data_types/data_type_nullable.h"
 #include "vec/data_types/data_type_number.h"
@@ -491,10 +493,13 @@ public:
     }
 };
 
-// concat_ws (string,string....)
+// concat_ws (string,string....) or (string, Array)
 // TODO: avoid use fmtlib
 class FunctionStringConcatWs : public IFunction {
 public:
+    using Chars = ColumnString::Chars;
+    using Offsets = ColumnString::Offsets;
+
     static constexpr auto name = "concat_ws";
     static FunctionPtr create() { return std::make_shared<FunctionStringConcatWs>(); }
     String get_name() const override { return name; }
@@ -520,8 +525,8 @@ public:
         auto res = ColumnString::create();
         bool is_null_type = block.get_by_position(arguments[0]).type.get()->is_nullable();
         size_t argument_size = arguments.size();
-        std::vector<const ColumnString::Offsets*> offsets_list(argument_size);
-        std::vector<const ColumnString::Chars*> chars_list(argument_size);
+        std::vector<const Offsets*> offsets_list(argument_size);
+        std::vector<const Chars*> chars_list(argument_size);
         std::vector<const ColumnUInt8::Container*> null_list(argument_size);
 
         ColumnPtr argument_columns[argument_size];
@@ -540,6 +545,11 @@ public:
             } else {
                 null_list[i] = &const_null_map->get_data();
             }
+
+            if (check_column<ColumnArray>(argument_columns[i].get())) {
+                continue;
+            }
+
             auto col_str = assert_cast<const ColumnString*>(argument_columns[i].get());
             offsets_list[i] = &col_str->get_offsets();
             chars_list[i] = &col_str->get_chars();
@@ -553,20 +563,126 @@ public:
         fmt::memory_buffer buffer;
         std::vector<std::string_view> views;
 
+        if (check_column<ColumnArray>(argument_columns[1].get())) {
+            // Determine if the nested type of the array is String
+            const ColumnArray& array_column =
+                    reinterpret_cast<const ColumnArray&>(*argument_columns[1]);
+            if (!array_column.get_data().is_column_string()) {
+                return Status::NotSupported(
+                        fmt::format("unsupported nested array of type {} for function {}",
+                                    is_column_nullable(array_column.get_data())
+                                            ? array_column.get_data().get_name()
+                                            : array_column.get_data().get_family_name(),
+                                    get_name()));
+            }
+            // Concat string in array
+            _execute_array(input_rows_count, array_column, buffer, views, offsets_list, chars_list,
+                           null_list, res_data, res_offset);
+
+        } else {
+            // Concat string
+            _execute_string(input_rows_count, argument_size, buffer, views, offsets_list,
+                            chars_list, null_list, res_data, res_offset);
+        }
+        if (is_null_type) {
+            block.get_by_position(result).column =
+                    ColumnNullable::create(std::move(res), std::move(null_map));
+        } else {
+            block.get_by_position(result).column = std::move(res);
+        }
+        return Status::OK();
+    }
+
+private:
+    void _execute_array(const size_t& input_rows_count, const ColumnArray& array_column,
+                        fmt::memory_buffer& buffer, std::vector<std::string_view>& views,
+                        const std::vector<const Offsets*>& offsets_list,
+                        const std::vector<const Chars*>& chars_list,
+                        const std::vector<const ColumnUInt8::Container*>& null_list,
+                        Chars& res_data, Offsets& res_offset) {
+        // Unwrap the array's element column, plus its null map when the elements are nullable
+        const UInt8* array_nested_null_map = nullptr;
+        ColumnPtr array_nested_column = nullptr;
+
+        if (is_column_nullable(array_column.get_data())) {
+            const auto& array_nested_null_column =
+                    reinterpret_cast<const ColumnNullable&>(array_column.get_data());
+            // Per-element null flags; flagged elements are left out of the joined result below
+            array_nested_null_map =
+                    array_nested_null_column.get_null_map_column().get_data().data();
+            array_nested_column = array_nested_null_column.get_nested_column_ptr();
+        } else {
+            array_nested_column = array_column.get_data_ptr();
+        }
+
+        const auto& string_column = reinterpret_cast<const ColumnString&>(*array_nested_column);
+        const Chars& string_src_chars = string_column.get_chars();
+        const Offsets& src_string_offsets = string_column.get_offsets();
+        const Offsets& src_array_offsets = array_column.get_offsets();
+        ColumnArray::Offset current_src_array_offset = 0;
+
+        // For each row, join the non-NULL elements of its array using that row's separator
         for (size_t i = 0; i < input_rows_count; ++i) {
-            auto& seq_offsets = *offsets_list[0];
-            auto& seq_chars = *chars_list[0];
-            auto& seq_nullmap = *null_list[0];
-            if (seq_nullmap[i]) {
-                res_data.push_back('\0');
+            auto& sep_offsets = *offsets_list[0];
+            auto& sep_chars = *chars_list[0];
+            auto& sep_nullmap = *null_list[0];
+
+            if (sep_nullmap[i]) {
                 res_offset[i] = res_data.size();
+                current_src_array_offset += src_array_offsets[i] - src_array_offsets[i - 1]; // NULL separator: skip this row's elements
                 continue;
             }
 
-            int seq_size = seq_offsets[i] - seq_offsets[i - 1] - 1;
-            const char* seq_data = reinterpret_cast<const char*>(&seq_chars[seq_offsets[i - 1]]);
+            int sep_size = sep_offsets[i] - sep_offsets[i - 1] - 1;
+            const char* sep_data = reinterpret_cast<const char*>(&sep_chars[sep_offsets[i - 1]]);
 
-            std::string_view seq(seq_data, seq_size);
+            std::string_view sep(sep_data, sep_size);
+            buffer.clear();
+            views.clear();
+
+            for (auto next_src_array_offset = src_array_offsets[i];
+                 current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
+                const auto current_src_string_offset =
+                        current_src_array_offset ? src_string_offsets[current_src_array_offset - 1]
+                                                 : 0;
+                size_t bytes_to_copy = src_string_offsets[current_src_array_offset] -
+                                       current_src_string_offset - 1;
+                const char* ptr =
+                        reinterpret_cast<const char*>(&string_src_chars[current_src_string_offset]);
+
+                if (array_nested_null_map == nullptr ||
+                    !array_nested_null_map[current_src_array_offset]) {
+                    views.emplace_back(ptr, bytes_to_copy);
+                }
+            }
+
+            fmt::format_to(buffer, "{}", fmt::join(views, sep));
+
+            StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data,
+                                        res_offset);
+        }
+    }
+
+    void _execute_string(const size_t& input_rows_count, const size_t& argument_size,
+                         fmt::memory_buffer& buffer, std::vector<std::string_view>& views,
+                         const std::vector<const Offsets*>& offsets_list,
+                         const std::vector<const Chars*>& chars_list,
+                         const std::vector<const ColumnUInt8::Container*>& null_list,
+                         Chars& res_data, Offsets& res_offset) {
+        // Concat string
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            auto& sep_offsets = *offsets_list[0];
+            auto& sep_chars = *chars_list[0];
+            auto& sep_nullmap = *null_list[0];
+            if (sep_nullmap[i]) {
+                res_offset[i] = res_data.size();
+                continue;
+            }
+
+            int sep_size = sep_offsets[i] - sep_offsets[i - 1] - 1;
+            const char* sep_data = reinterpret_cast<const char*>(&sep_chars[sep_offsets[i - 1]]);
+
+            std::string_view sep(sep_data, sep_size);
             buffer.clear();
             views.clear();
             for (size_t j = 1; j < argument_size; ++j) {
@@ -580,17 +696,10 @@ public:
                     views.emplace_back(ptr, size);
                 }
             }
-            fmt::format_to(buffer, "{}", fmt::join(views, seq));
+            fmt::format_to(buffer, "{}", fmt::join(views, sep));
             StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data,
                                         res_offset);
         }
-        if (is_null_type) {
-            block.get_by_position(result).column =
-                    ColumnNullable::create(std::move(res), std::move(null_map));
-        } else {
-            block.get_by_position(result).column = std::move(res);
-        }
-        return Status::OK();
     }
 };
 
diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp
index 60cb797aae..2d259014be 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ b/be/test/vec/function/function_string_test.cpp
@@ -387,6 +387,23 @@ TEST(function_string_test, function_concat_ws_test) {
 
         check_function<DataTypeString, true>(func_name, input_types, data_set);
     };
+
+    {
+        InputTypeSet input_types = {TypeIndex::String, TypeIndex::Array, TypeIndex::String};
+
+        Array vec1 = {Field("", 0), Field("", 0), Field("", 0)};
+        Array vec2 = {Field("123", 3), Field("456", 3), Field("789", 3)};
+        Array vec3 = {Field("", 0), Field("?", 1), Field("", 0)};
+        Array vec4 = {Field("abc", 3), Field("", 0), Field("def", 3)};
+        Array vec5 = {Field("abc", 3), Field("def", 3), Field("ghi", 3)};
+        DataSet data_set = {{{std::string("-"), vec1}, std::string("--")},
+                            {{std::string(""), vec2}, std::string("123456789")},
+                            {{std::string("-"), vec3}, std::string("-?-")},
+                            {{Null(), vec4}, Null()},
+                            {{std::string("-"), vec5}, std::string("abc-def-ghi")}};
+
+        check_function<DataTypeString, true>(func_name, input_types, data_set);
+    };
 }
 
 TEST(function_string_test, function_null_or_empty_test) {
diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md b/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md
index 80a0c11dee..a8a4f57d4b 100644
--- a/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md
+++ b/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md
@@ -28,12 +28,13 @@ under the License.
 ### Description
 #### Syntax
 
-`VARCHAR concat ws (VARCHAR sep, VARCHAR str,...)`
+`VARCHAR concat_ws(VARCHAR sep, VARCHAR str,...)`
+`VARCHAR concat_ws(VARCHAR sep, ARRAY array)`
 
 
-Using the first parameter SEP as a connector, the second parameter and all subsequent parameters are spliced into a string.
+Using the first parameter SEP as a connector, the second parameter and all subsequent parameters (or all strings in an ARRAY) are spliced into a string.
 If the separator is NULL, return NULL.
-` The concat_ws` function does not skip empty strings, but NULL values.
+The `concat_ws` function does not skip empty strings, but it does skip NULL values.
 
 ### example
 
@@ -58,6 +59,27 @@ mysql> select concat_ws("or", "d", NULL,"is");
 +---------------------------------+
 | doris                           |
 +---------------------------------+
+
+mysql> select concat_ws("or", ["d", "is"]);
++-----------------------------------+
+| concat_ws('or', ARRAY('d', 'is')) |
++-----------------------------------+
+| doris                             |
++-----------------------------------+
+
+mysql> select concat_ws(NULL, ["d", "is"]);
++-----------------------------------+
+| concat_ws(NULL, ARRAY('d', 'is')) |
++-----------------------------------+
+| NULL                              |
++-----------------------------------+
+
+mysql> select concat_ws("or", ["d", NULL,"is"]);
++-----------------------------------------+
+| concat_ws('or', ARRAY('d', NULL, 'is')) |
++-----------------------------------------+
+| doris                                   |
++-----------------------------------------+
 ```
 ### keywords
-CONCAT_WS,CONCAT,WS
+CONCAT_WS,CONCAT,WS,ARRAY
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md
index bcebf9f4a6..a21d1b3ba5 100644
--- a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md
@@ -29,11 +29,12 @@ under the License.
 #### Syntax
 
 `VARCHAR concat_ws(VARCHAR sep, VARCHAR str,...)`
+`VARCHAR concat_ws(VARCHAR sep, ARRAY array)`
 
 
-使用第一个参数 sep 作为连接符,将第二个参数以及后续所有参数拼接成一个字符串.
+使用第一个参数 sep 作为连接符,将第二个参数以及后续所有参数(或ARRAY中的所有字符串)拼接成一个字符串。
 如果分隔符是 NULL,返回 NULL。
-`concat_ws`函数不会跳过空字符串,会跳过 NULL 值
+`concat_ws`函数不会跳过空字符串,会跳过 NULL 值。
 
 ### example
 
@@ -58,6 +59,27 @@ mysql> select concat_ws("or", "d", NULL,"is");
 +---------------------------------+
 | doris                           |
 +---------------------------------+
+
+mysql> select concat_ws("or", ["d", "is"]);
++-----------------------------------+
+| concat_ws('or', ARRAY('d', 'is')) |
++-----------------------------------+
+| doris                             |
++-----------------------------------+
+
+mysql> select concat_ws(NULL, ["d", "is"]);
++-----------------------------------+
+| concat_ws(NULL, ARRAY('d', 'is')) |
++-----------------------------------+
+| NULL                              |
++-----------------------------------+
+
+mysql> select concat_ws("or", ["d", NULL,"is"]);
++-----------------------------------------+
+| concat_ws('or', ARRAY('d', NULL, 'is')) |
++-----------------------------------------+
+| doris                                   |
++-----------------------------------------+
 ```
 ### keywords
-CONCAT_WS,CONCAT,WS
+CONCAT_WS,CONCAT,WS,ARRAY
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index 3883e7b1ac..249638831b 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -1018,6 +1018,9 @@ visible_functions = [
     [['concat_ws'], 'VARCHAR', ['VARCHAR', 'VARCHAR', '...'],
             '_ZN5doris15StringFunctions9concat_wsEPN9doris_udf'
             '15FunctionContextERKNS1_9StringValEiPS5_', '', '', 'vec', 'CUSTOM'],
+    [['concat_ws'], 'VARCHAR', ['VARCHAR', 'ARRAY_VARCHAR'],
+            '_ZN5doris15StringFunctions9concat_wsEPN9doris_udf'
+            '15FunctionContextERKNS1_9StringValEiPS5_', '', '', 'vec', 'CUSTOM'],
     [['find_in_set'], 'INT', ['VARCHAR', 'VARCHAR'],
             '_ZN5doris15StringFunctions11find_in_setEPN9doris_udf'
             '15FunctionContextERKNS1_9StringValES6_', '', '', 'vec', ''],
diff --git a/regression-test/data/query/sql_functions/string_functions/test_string_function.out b/regression-test/data/query/sql_functions/string_functions/test_string_function.out
index 70c7d33202..61e0295054 100644
--- a/regression-test/data/query/sql_functions/string_functions/test_string_function.out
+++ b/regression-test/data/query/sql_functions/string_functions/test_string_function.out
@@ -41,6 +41,18 @@ doris
 -- !sql --
 doris
 
+-- !sql --
+doris
+
+-- !sql --
+\N
+
+-- !sql --
+doris
+
+-- !sql --
+dororis
+
 -- !sql --
 true
 
diff --git a/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy
index 00351890ef..8ecbc0e3d5 100644
--- a/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy
+++ b/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy
@@ -38,6 +38,10 @@ suite("test_string_function", "query") {
     qt_sql "select concat_ws(\"or\", \"d\", \"is\");"
     qt_sql "select concat_ws(NULL, \"d\", \"is\");"
     qt_sql "select concat_ws(\"or\", \"d\", NULL,\"is\");"
+    qt_sql "select concat_ws(\"or\", [\"d\", \"is\"]);"
+    qt_sql "select concat_ws(NULL, [\"d\", \"is\"]);"
+    qt_sql "select concat_ws(\"or\", [\"d\", NULL,\"is\"]);"
+    qt_sql "select concat_ws(\"or\", [\"d\", \"\",\"is\"]);"
 
     qt_sql "select ends_with(\"Hello doris\", \"doris\");"
     qt_sql "select ends_with(\"Hello doris\", \"Hello\");"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org