You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2022/10/27 12:41:30 UTC

[GitHub] [doris] Yukang-Lian commented on a diff in pull request #13741: [feature](split_by_char)support split by char function

Yukang-Lian commented on code in PR #13741:
URL: https://github.com/apache/doris/pull/13741#discussion_r1006819548


##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+	    int32_t pos_start = 0;
+	    int32_t pos_end = 0;
+        int32_t len = s.size();
+        bool flag = true;
+
+	    while (flag) {
+		    while (pos < len && s[pos] == delimiter_char) {
+			    pos++;
+                if (pos >= len - 1) {
+                    flag = false;
+                }
+            }
+
+            if (!flag || pos >= len) {
+                break;
+            }
+            pos_start = pos;
+            v_offset.emplace_back(pos_start);
+            while (pos < len && s[pos] != delimiter_char) {
+                pos++;
+            }
+            pos_end = pos;
+            v_charlen.emplace_back(pos_end - pos_start);
+        }
+    }
+public:
+    static constexpr auto name = "split_by_char";
+    static FunctionPtr create() { return std::make_shared<FunctionSplitByChar>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()));
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; }
+    bool use_default_implementation_for_constants() const override { return true; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 2);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        //auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto col_res = ColumnArray::create(ColumnString::create());
+
+        auto& res_data = typeid_cast<ColumnString &>(col_res->get_data());
+        auto& res_offsets = col_res->get_offsets();
+
+        auto& res_data_chars = res_data.get_chars();
+        auto& res_data_offsets = res_data.get_offsets();
+
+        //auto& null_map_data = null_map->get_data();
+
+        res_data_offsets.resize(input_rows_count);
+
+        /**
+         * 获得 argument参数(列数据),并存入argument_columns数组中,[0]为str,[1]为delimiter
+        */
+        size_t argument_size = arguments.size();
+        ColumnPtr argument_columns[argument_size];
+        for (size_t i = 0; i < argument_size; ++i) {
+            argument_columns[i] = block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
+            if (auto* nullable = check_and_get_column<const ColumnNullable>(*argument_columns[i])) {
+                // Danger: Here must dispose the null map data first! Because
+                // argument_columns[i]=nullable->get_nested_column_ptr(); will release the mem
+                // of column nullable mem of null map
+                VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data());
+                argument_columns[i] = nullable->get_nested_column_ptr();
+            }
+        }
+        auto str_col = assert_cast<const ColumnString*>(argument_columns[0].get());
+        auto delimiter_col = assert_cast<const ColumnString*>(argument_columns[1].get());
+        
+        /**
+         * 取出列元素中的每一行(delimiter,str),并且进行相关的操作
+        */
+        for (size_t i = 0; i < input_rows_count; ++i) {    
+            auto delimiter = delimiter_col->get_data_at(i);
+            auto delimiter_str = delimiter_col->get_data_at(i).to_string();
+            auto str = str_col->get_data_at(i);
+            auto str_str = str_col->get_data_at(i).to_string();
+            if (delimiter.size == 0) {
+                res_data_offsets[i] = res_data_chars.size();
+            } else if (delimiter.size == 1) {
+                std::vector<int> v_offset;
+                std::vector<int> v_charlen;
+                getOffsetsAndLen(str_col->get_data_at(i).to_string(), delimiter_str, v_offset, v_charlen);
+                for (size_t i = 0; i < v_offset.size(); i++) {
+                    StringOP::push_value_string1(
+                            std::string_view {
+                                    reinterpret_cast<const char*>(str.data + v_offset[i] + 1),
+                                    (size_t)v_charlen[i] - 1},
+                            i, res_data_chars, res_data_offsets);
+                    //res_data_offsets.emplace_back(v_charlen[i]);
+                }
+                res_offsets.emplace_back(v_offset.size()); 
+
+            }
+             
+        }
+        //block.replace_by_position(result, std::move(col_res));

Review Comment:
   here too



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org