You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2022/07/29 06:16:37 UTC

[GitHub] [doris] xy720 commented on a diff in pull request #11306: [refactor](array-type) optimize the array_distinct and array_sort fun…

xy720 commented on code in PR #11306:
URL: https://github.com/apache/doris/pull/11306#discussion_r932898835


##########
be/src/vec/functions/array/function_array_sort.cpp:
##########
@@ -15,12 +15,203 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "vec/functions/array/function_array_sort.h"
-
+#include "vec/functions/array/function_array_unary.h"
 #include "vec/functions/simple_function_factory.h"
 
 namespace doris::vectorized {
 
+struct NameArraySort {
+    static constexpr auto name = "array_sort";
+};
+
+struct ArraySortImpl {
+    // sort the non-null element according to the permutation
+    template <typename SrcDataType>
+    static void _sort_by_permutation(ColumnArray::Offset& prev_offset,
+                                     const ColumnArray::Offset& curr_offset,
+                                     const SrcDataType* src_data_concrete,
+                                     const IColumn& src_column, const NullMapType* src_null_map,
+                                     IColumn::Permutation& permutation) {
+        for (ColumnArray::Offset j = prev_offset; j < curr_offset - 1; ++j) {
+            if (src_null_map && (*src_null_map)[j]) {
+                continue;
+            }
+            for (ColumnArray::Offset k = j + 1; k < curr_offset; ++k) {
+                if (src_null_map && (*src_null_map)[k]) {
+                    continue;
+                }
+                int result = src_data_concrete->compare_at(permutation[j], permutation[k],
+                                                           src_column, 1);
+                if (result > 0) {
+                    auto temp = permutation[j];
+                    permutation[j] = permutation[k];
+                    permutation[k] = temp;
+                }
+            }
+        }
+        return;
+    }
+
+    template <typename ColumnType>
+    static bool _execute_number(const IColumn& src_column, const ColumnArray::Offsets& src_offsets,
+                                IColumn& dest_column, ColumnArray::Offsets& dest_offsets,
+                                const NullMapType* src_null_map, NullMapType* dest_null_map) {
+        using NestType = typename ColumnType::value_type;
+        const ColumnType* src_data_concrete = reinterpret_cast<const ColumnType*>(&src_column);
+        if (!src_data_concrete) {
+            return false;
+        }
+        const PaddedPODArray<NestType>& src_datas = src_data_concrete->get_data();
+
+        ColumnType& dest_data_concrete = reinterpret_cast<ColumnType&>(dest_column);
+        PaddedPODArray<NestType>& dest_datas = dest_data_concrete.get_data();
+
+        ColumnArray::Offset prev_src_offset = 0;
+        IColumn::Permutation permutation(src_column.size());
+        for (size_t i = 0; i < src_column.size(); ++i) {
+            permutation[i] = i;
+        }
+
+        for (auto curr_src_offset : src_offsets) {
+            // filter and insert null element first
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    DCHECK(dest_null_map != nullptr);
+                    (*dest_null_map).push_back(true);
+                    dest_datas.push_back(NestType());
+                }
+            }
+
+            _sort_by_permutation<ColumnType>(prev_src_offset, curr_src_offset, src_data_concrete,
+                                             src_column, src_null_map, permutation);
+
+            // insert non-null element after sort by permutation
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    continue;
+                }
+
+                dest_datas.push_back(src_datas[permutation[j]]);
+                if (dest_null_map) {
+                    (*dest_null_map).push_back(false);
+                }
+            }
+            dest_offsets.push_back(curr_src_offset);
+            prev_src_offset = curr_src_offset;
+        }
+
+        return true;
+    }
+
+    static bool _execute_string(const IColumn& src_column, const ColumnArray::Offsets& src_offsets,
+                                IColumn& dest_column, ColumnArray::Offsets& dest_offsets,
+                                const NullMapType* src_null_map, NullMapType* dest_null_map) {
+        const ColumnString* src_data_concrete = reinterpret_cast<const ColumnString*>(&src_column);
+        if (!src_data_concrete) {
+            return false;
+        }
+
+        ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_column);
+        ColumnString::Chars& column_string_chars = dest_column_string.get_chars();
+        ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets();
+        column_string_chars.reserve(src_column.size());
+
+        ColumnArray::Offset prev_src_offset = 0;
+        IColumn::Permutation permutation(src_column.size());
+        for (size_t i = 0; i < src_column.size(); ++i) {
+            permutation[i] = i;
+        }
+
+        for (auto curr_src_offset : src_offsets) {
+            // filter and insert null element first
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    DCHECK(dest_null_map != nullptr);
+                    column_string_offsets.push_back(column_string_offsets.back());
+                    (*dest_null_map).push_back(true);
+                }
+            }
+
+            _sort_by_permutation<ColumnString>(prev_src_offset, curr_src_offset, src_data_concrete,
+                                               src_column, src_null_map, permutation);
+
+            // insert non-null element after sort by permutation
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    continue;
+                }
+
+                StringRef src_str_ref = src_data_concrete->get_data_at(permutation[j]);
+                // copy the src data to column_string_chars
+                const size_t old_size = column_string_chars.size();
+                const size_t new_size = old_size + src_str_ref.size + 1;
+                column_string_chars.resize(new_size);
+                if (src_str_ref.size > 0) {
+                    memcpy(column_string_chars.data() + old_size, src_str_ref.data,
+                           src_str_ref.size);
+                }
+                column_string_chars[old_size + src_str_ref.size] = 0;
+                column_string_offsets.push_back(new_size);
+
+                if (dest_null_map) {
+                    (*dest_null_map).push_back(false);
+                }
+            }
+            dest_offsets.push_back(curr_src_offset);
+            prev_src_offset = curr_src_offset;
+        }
+        return true;
+    }
+
+    static bool _execute_by_type(const IColumn& src_column, const ColumnArray::Offsets& src_offsets,

Review Comment:
   This function is completely duplicate with the one in function_array_distinct.cpp.
   You need to think for an abstract method or impl.



##########
be/src/vec/functions/array/function_array_unary.h:
##########
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_const.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/functions/function.h"
+#include "vec/functions/function_helpers.h"
+
+namespace doris::vectorized {
+using NullMapType = PaddedPODArray<UInt8>;
+
+// define functions with one array type argument.
+template <typename Impl, typename Name>
+class FunctionArrayUnary : public IFunction {
+public:
+    static constexpr auto name = Name::name;
+    static FunctionPtr create() { return std::make_shared<FunctionArrayUnary>(); }
+
+    /// Get function name.
+    String get_name() const override { return name; }
+
+    bool is_variadic() const override { return false; }
+
+    size_t get_number_of_arguments() const override { return 1; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        DCHECK(is_array(arguments[0]))
+                << "first argument for function: " << name << " should be DataTypeArray"
+                << " and arguments[0] is " << arguments[0]->get_name();
+        return arguments[0];
+    }
+
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        ColumnPtr src_column =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+        const auto& src_column_array = check_and_get_column<ColumnArray>(*src_column);

Review Comment:
   You should use the extract_column_array_info() in function_array_utils.h here.



##########
be/src/vec/functions/array/function_array_sort.cpp:
##########
@@ -15,12 +15,203 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "vec/functions/array/function_array_sort.h"
-
+#include "vec/functions/array/function_array_unary.h"
 #include "vec/functions/simple_function_factory.h"
 
 namespace doris::vectorized {
 
+struct NameArraySort {
+    static constexpr auto name = "array_sort";
+};
+
+struct ArraySortImpl {
+    // sort the non-null element according to the permutation
+    template <typename SrcDataType>
+    static void _sort_by_permutation(ColumnArray::Offset& prev_offset,
+                                     const ColumnArray::Offset& curr_offset,
+                                     const SrcDataType* src_data_concrete,
+                                     const IColumn& src_column, const NullMapType* src_null_map,
+                                     IColumn::Permutation& permutation) {
+        for (ColumnArray::Offset j = prev_offset; j < curr_offset - 1; ++j) {
+            if (src_null_map && (*src_null_map)[j]) {
+                continue;
+            }
+            for (ColumnArray::Offset k = j + 1; k < curr_offset; ++k) {
+                if (src_null_map && (*src_null_map)[k]) {
+                    continue;
+                }
+                int result = src_data_concrete->compare_at(permutation[j], permutation[k],
+                                                           src_column, 1);
+                if (result > 0) {
+                    auto temp = permutation[j];
+                    permutation[j] = permutation[k];
+                    permutation[k] = temp;
+                }
+            }
+        }
+        return;
+    }
+
+    template <typename ColumnType>
+    static bool _execute_number(const IColumn& src_column, const ColumnArray::Offsets& src_offsets,
+                                IColumn& dest_column, ColumnArray::Offsets& dest_offsets,
+                                const NullMapType* src_null_map, NullMapType* dest_null_map) {
+        using NestType = typename ColumnType::value_type;
+        const ColumnType* src_data_concrete = reinterpret_cast<const ColumnType*>(&src_column);
+        if (!src_data_concrete) {
+            return false;
+        }
+        const PaddedPODArray<NestType>& src_datas = src_data_concrete->get_data();
+
+        ColumnType& dest_data_concrete = reinterpret_cast<ColumnType&>(dest_column);
+        PaddedPODArray<NestType>& dest_datas = dest_data_concrete.get_data();
+
+        ColumnArray::Offset prev_src_offset = 0;
+        IColumn::Permutation permutation(src_column.size());
+        for (size_t i = 0; i < src_column.size(); ++i) {
+            permutation[i] = i;
+        }
+
+        for (auto curr_src_offset : src_offsets) {
+            // filter and insert null element first
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    DCHECK(dest_null_map != nullptr);
+                    (*dest_null_map).push_back(true);
+                    dest_datas.push_back(NestType());
+                }
+            }
+
+            _sort_by_permutation<ColumnType>(prev_src_offset, curr_src_offset, src_data_concrete,
+                                             src_column, src_null_map, permutation);
+
+            // insert non-null element after sort by permutation
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    continue;
+                }
+
+                dest_datas.push_back(src_datas[permutation[j]]);
+                if (dest_null_map) {
+                    (*dest_null_map).push_back(false);
+                }
+            }
+            dest_offsets.push_back(curr_src_offset);
+            prev_src_offset = curr_src_offset;
+        }
+
+        return true;
+    }
+
+    static bool _execute_string(const IColumn& src_column, const ColumnArray::Offsets& src_offsets,
+                                IColumn& dest_column, ColumnArray::Offsets& dest_offsets,
+                                const NullMapType* src_null_map, NullMapType* dest_null_map) {
+        const ColumnString* src_data_concrete = reinterpret_cast<const ColumnString*>(&src_column);
+        if (!src_data_concrete) {
+            return false;
+        }
+
+        ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_column);

Review Comment:
   Duplicate too.



##########
be/src/vec/functions/array/function_array_sort.cpp:
##########
@@ -15,12 +15,203 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "vec/functions/array/function_array_sort.h"
-
+#include "vec/functions/array/function_array_unary.h"
 #include "vec/functions/simple_function_factory.h"
 
 namespace doris::vectorized {
 
+struct NameArraySort {
+    static constexpr auto name = "array_sort";
+};
+
+struct ArraySortImpl {
+    // sort the non-null element according to the permutation
+    template <typename SrcDataType>
+    static void _sort_by_permutation(ColumnArray::Offset& prev_offset,
+                                     const ColumnArray::Offset& curr_offset,
+                                     const SrcDataType* src_data_concrete,
+                                     const IColumn& src_column, const NullMapType* src_null_map,
+                                     IColumn::Permutation& permutation) {
+        for (ColumnArray::Offset j = prev_offset; j < curr_offset - 1; ++j) {
+            if (src_null_map && (*src_null_map)[j]) {
+                continue;
+            }
+            for (ColumnArray::Offset k = j + 1; k < curr_offset; ++k) {
+                if (src_null_map && (*src_null_map)[k]) {
+                    continue;
+                }
+                int result = src_data_concrete->compare_at(permutation[j], permutation[k],
+                                                           src_column, 1);
+                if (result > 0) {
+                    auto temp = permutation[j];
+                    permutation[j] = permutation[k];
+                    permutation[k] = temp;
+                }
+            }
+        }
+        return;
+    }
+
+    template <typename ColumnType>
+    static bool _execute_number(const IColumn& src_column, const ColumnArray::Offsets& src_offsets,
+                                IColumn& dest_column, ColumnArray::Offsets& dest_offsets,
+                                const NullMapType* src_null_map, NullMapType* dest_null_map) {
+        using NestType = typename ColumnType::value_type;
+        const ColumnType* src_data_concrete = reinterpret_cast<const ColumnType*>(&src_column);
+        if (!src_data_concrete) {
+            return false;
+        }
+        const PaddedPODArray<NestType>& src_datas = src_data_concrete->get_data();
+
+        ColumnType& dest_data_concrete = reinterpret_cast<ColumnType&>(dest_column);
+        PaddedPODArray<NestType>& dest_datas = dest_data_concrete.get_data();
+
+        ColumnArray::Offset prev_src_offset = 0;
+        IColumn::Permutation permutation(src_column.size());
+        for (size_t i = 0; i < src_column.size(); ++i) {
+            permutation[i] = i;
+        }
+
+        for (auto curr_src_offset : src_offsets) {
+            // filter and insert null element first
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    DCHECK(dest_null_map != nullptr);
+                    (*dest_null_map).push_back(true);
+                    dest_datas.push_back(NestType());
+                }
+            }
+
+            _sort_by_permutation<ColumnType>(prev_src_offset, curr_src_offset, src_data_concrete,
+                                             src_column, src_null_map, permutation);
+
+            // insert non-null element after sort by permutation
+            for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) {
+                if (src_null_map && (*src_null_map)[j]) {
+                    continue;
+                }
+
+                dest_datas.push_back(src_datas[permutation[j]]);
+                if (dest_null_map) {
+                    (*dest_null_map).push_back(false);
+                }
+            }
+            dest_offsets.push_back(curr_src_offset);
+            prev_src_offset = curr_src_offset;
+        }
+
+        return true;
+    }
+
+    static bool _execute_string(const IColumn& src_column, const ColumnArray::Offsets& src_offsets,
+                                IColumn& dest_column, ColumnArray::Offsets& dest_offsets,
+                                const NullMapType* src_null_map, NullMapType* dest_null_map) {
+        const ColumnString* src_data_concrete = reinterpret_cast<const ColumnString*>(&src_column);

Review Comment:
   Duplicate with function_array_distinct.cpp
   If it is hard to add an abstract method, you can put the duplicate code into function_array_utils.h and use a common util method instead.



##########
be/src/vec/functions/array/function_array_unary.h:
##########
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_const.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/functions/function.h"
+#include "vec/functions/function_helpers.h"
+
+namespace doris::vectorized {
+using NullMapType = PaddedPODArray<UInt8>;
+
+// define functions with one array type argument.
+template <typename Impl, typename Name>
+class FunctionArrayUnary : public IFunction {
+public:
+    static constexpr auto name = Name::name;
+    static FunctionPtr create() { return std::make_shared<FunctionArrayUnary>(); }
+
+    /// Get function name.
+    String get_name() const override { return name; }
+
+    bool is_variadic() const override { return false; }
+
+    size_t get_number_of_arguments() const override { return 1; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        DCHECK(is_array(arguments[0]))
+                << "first argument for function: " << name << " should be DataTypeArray"
+                << " and arguments[0] is " << arguments[0]->get_name();
+        return arguments[0];
+    }
+
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        ColumnPtr src_column =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+        const auto& src_column_array = check_and_get_column<ColumnArray>(*src_column);
+        if (!src_column_array) {
+            return Status::RuntimeError(
+                    fmt::format("unsupported types for function {}({})", get_name(),
+                                block.get_by_position(arguments[0]).type->get_name()));
+        }
+        const auto& src_offsets = src_column_array->get_offsets();
+        const auto* src_nested_column = &src_column_array->get_data();
+        DCHECK(src_nested_column != nullptr);
+
+        DataTypePtr src_column_type = block.get_by_position(arguments[0]).type;
+        auto nested_type = assert_cast<const DataTypeArray&>(*src_column_type).get_nested_type();
+        auto dest_column_ptr = ColumnArray::create(nested_type->create_column(),
+                                                   ColumnArray::ColumnOffsets::create());
+        IColumn* dest_nested_column = &dest_column_ptr->get_data();
+        ColumnArray::Offsets& dest_offsets = dest_column_ptr->get_offsets();
+        DCHECK(dest_nested_column != nullptr);
+        dest_nested_column->reserve(src_nested_column->size());
+        dest_offsets.reserve(input_rows_count);
+
+        const NullMapType* src_null_map = nullptr;
+        if (src_nested_column->is_nullable()) {
+            const ColumnNullable* src_nested_nullable_col =
+                    check_and_get_column<ColumnNullable>(*src_nested_column);
+            src_nested_column = src_nested_nullable_col->get_nested_column_ptr();
+            src_null_map = &src_nested_nullable_col->get_null_map_column().get_data();
+        }
+
+        NullMapType* dest_null_map = nullptr;
+        if (dest_nested_column->is_nullable()) {
+            ColumnNullable* dest_nested_nullable_col =
+                    reinterpret_cast<ColumnNullable*>(dest_nested_column);
+            dest_nested_column = dest_nested_nullable_col->get_nested_column_ptr();
+            dest_null_map = &dest_nested_nullable_col->get_null_map_column().get_data();
+        }
+
+        auto res_val =
+                Impl::_execute_by_type(*src_nested_column, src_offsets, *dest_nested_column,

Review Comment:
   ```suggestion
                   Impl::execute(*src_nested_column, src_offsets, *dest_nested_column,
   ```
   I think execute is better. Because we may be add a function in future which don't need to classify the execute by type.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org