You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@doris.apache.org by yi...@apache.org on 2022/06/16 10:09:38 UTC

[incubator-doris] branch master updated: [Enhancement][Storage] refactor InListPredicate/NotInListPredicate (#10139)

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ae9c231925 [Enhancement][Storage] refactor InListPredicate/NotInListPredicate (#10139)
ae9c231925 is described below

commit ae9c231925d7b4dd76acac2f924b550217f64b28
Author: Pxl <95...@qq.com>
AuthorDate: Thu Jun 16 18:09:29 2022 +0800

    [Enhancement][Storage] refactor InListPredicate/NotInListPredicate (#10139)
    
    * refactor in_list_pred
    
    * update
---
 be/src/olap/CMakeLists.txt             |   1 -
 be/src/olap/in_list_predicate.cpp      | 391 ---------------------------------
 be/src/olap/in_list_predicate.h        | 313 +++++++++++++++++++++++---
 be/src/vec/columns/column_dictionary.h |  18 +-
 4 files changed, 289 insertions(+), 434 deletions(-)

diff --git a/be/src/olap/CMakeLists.txt b/be/src/olap/CMakeLists.txt
index 2c02122883..615a9fff44 100644
--- a/be/src/olap/CMakeLists.txt
+++ b/be/src/olap/CMakeLists.txt
@@ -47,7 +47,6 @@ add_library(Olap STATIC
     file_stream.cpp
     generic_iterators.cpp
     hll.cpp
-    in_list_predicate.cpp
     bloom_filter_predicate.cpp
     in_stream.cpp
     key_coder.cpp
diff --git a/be/src/olap/in_list_predicate.cpp b/be/src/olap/in_list_predicate.cpp
deleted file mode 100644
index 1b2ab20f3b..0000000000
--- a/be/src/olap/in_list_predicate.cpp
+++ /dev/null
@@ -1,391 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "olap/in_list_predicate.h"
-
-#include "olap/field.h"
-#include "runtime/string_value.hpp"
-#include "runtime/vectorized_row_batch.h"
-#include "vec/columns/column_dictionary.h"
-#include "vec/columns/column_nullable.h"
-#include "vec/columns/predicate_column.h"
-
-namespace doris {
-
-#define IN_LIST_PRED_CONSTRUCTOR(CLASS)                                                  \
-    template <class T>                                                                   \
-    CLASS<T>::CLASS(uint32_t column_id, phmap::flat_hash_set<T>&& values, bool opposite) \
-            : ColumnPredicate(column_id, opposite), _values(std::move(values)) {}
-
-IN_LIST_PRED_CONSTRUCTOR(InListPredicate)
-IN_LIST_PRED_CONSTRUCTOR(NotInListPredicate)
-
-#define IN_LIST_PRED_EVALUATE(CLASS, OP)                                                         \
-    template <class T>                                                                           \
-    void CLASS<T>::evaluate(VectorizedRowBatch* batch) const {                                   \
-        uint16_t n = batch->size();                                                              \
-        if (n == 0) {                                                                            \
-            return;                                                                              \
-        }                                                                                        \
-        uint16_t* sel = batch->selected();                                                       \
-        const T* col_vector = reinterpret_cast<const T*>(batch->column(_column_id)->col_data()); \
-        uint16_t new_size = 0;                                                                   \
-        if (batch->column(_column_id)->no_nulls()) {                                             \
-            if (batch->selected_in_use()) {                                                      \
-                for (uint16_t j = 0; j != n; ++j) {                                              \
-                    uint16_t i = sel[j];                                                         \
-                    sel[new_size] = i;                                                           \
-                    new_size += (_values.find(col_vector[i]) OP _values.end());                  \
-                }                                                                                \
-                batch->set_size(new_size);                                                       \
-            } else {                                                                             \
-                for (uint16_t i = 0; i != n; ++i) {                                              \
-                    sel[new_size] = i;                                                           \
-                    new_size += (_values.find(col_vector[i]) OP _values.end());                  \
-                }                                                                                \
-                if (new_size < n) {                                                              \
-                    batch->set_size(new_size);                                                   \
-                    batch->set_selected_in_use(true);                                            \
-                }                                                                                \
-            }                                                                                    \
-        } else {                                                                                 \
-            bool* is_null = batch->column(_column_id)->is_null();                                \
-            if (batch->selected_in_use()) {                                                      \
-                for (uint16_t j = 0; j != n; ++j) {                                              \
-                    uint16_t i = sel[j];                                                         \
-                    sel[new_size] = i;                                                           \
-                    new_size += (!is_null[i] && _values.find(col_vector[i]) OP _values.end());   \
-                }                                                                                \
-                batch->set_size(new_size);                                                       \
-            } else {                                                                             \
-                for (int i = 0; i != n; ++i) {                                                   \
-                    sel[new_size] = i;                                                           \
-                    new_size += (!is_null[i] && _values.find(col_vector[i]) OP _values.end());   \
-                }                                                                                \
-                if (new_size < n) {                                                              \
-                    batch->set_size(new_size);                                                   \
-                    batch->set_selected_in_use(true);                                            \
-                }                                                                                \
-            }                                                                                    \
-        }                                                                                        \
-    }
-
-IN_LIST_PRED_EVALUATE(InListPredicate, !=)
-IN_LIST_PRED_EVALUATE(NotInListPredicate, ==)
-
-#define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(CLASS, OP)                                          \
-    template <class T>                                                                         \
-    void CLASS<T>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const {         \
-        uint16_t new_size = 0;                                                                 \
-        if (block->is_nullable()) {                                                            \
-            for (uint16_t i = 0; i < *size; ++i) {                                             \
-                uint16_t idx = sel[i];                                                         \
-                sel[new_size] = idx;                                                           \
-                const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \
-                auto result = (!block->cell(idx).is_null() && _values.find(*cell_value)        \
-                                                                      OP _values.end());       \
-                new_size += _opposite ? !result : result;                                      \
-            }                                                                                  \
-        } else {                                                                               \
-            for (uint16_t i = 0; i < *size; ++i) {                                             \
-                uint16_t idx = sel[i];                                                         \
-                sel[new_size] = idx;                                                           \
-                const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \
-                auto result = (_values.find(*cell_value) OP _values.end());                    \
-                new_size += _opposite ? !result : result;                                      \
-            }                                                                                  \
-        }                                                                                      \
-        *size = new_size;                                                                      \
-    }
-
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(InListPredicate, !=)
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(NotInListPredicate, ==)
-
-// todo(zeno) define interface in IColumn to simplify code
-#define IN_LIST_PRED_COLUMN_EVALUATE(CLASS, OP)                                                  \
-    template <class T>                                                                           \
-    void CLASS<T>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const {  \
-        uint16_t new_size = 0;                                                                   \
-        if (column.is_nullable()) {                                                              \
-            auto* nullable_col =                                                                 \
-                    vectorized::check_and_get_column<vectorized::ColumnNullable>(column);        \
-            auto& null_bitmap = reinterpret_cast<const vectorized::ColumnUInt8&>(                \
-                                        nullable_col->get_null_map_column())                     \
-                                        .get_data();                                             \
-            auto& nested_col = nullable_col->get_nested_column();                                \
-            if (nested_col.is_column_dictionary()) {                                             \
-                if constexpr (std::is_same_v<T, StringValue>) {                                  \
-                    auto* nested_col_ptr = vectorized::check_and_get_column<                     \
-                            vectorized::ColumnDictionary<vectorized::Int32>>(nested_col);        \
-                    auto& data_array = nested_col_ptr->get_data();                               \
-                    std::vector<bool> selected;                                                  \
-                    nested_col_ptr->find_codes(_values, selected);                               \
-                    for (uint16_t i = 0; i < *size; i++) {                                       \
-                        uint16_t idx = sel[i];                                                   \
-                        sel[new_size] = idx;                                                     \
-                        const auto& cell_value = data_array[idx];                                \
-                        DCHECK(cell_value < (int64_t)selected.size());                           \
-                        bool ret = !null_bitmap[idx] && (selected[cell_value] OP false);         \
-                        new_size += _opposite ? !ret : ret;                                      \
-                    }                                                                            \
-                }                                                                                \
-            } else {                                                                             \
-                auto* nested_col_ptr =                                                           \
-                        vectorized::check_and_get_column<vectorized::PredicateColumnType<T>>(    \
-                                nested_col);                                                     \
-                auto& data_array = nested_col_ptr->get_data();                                   \
-                for (uint16_t i = 0; i < *size; i++) {                                           \
-                    uint16_t idx = sel[i];                                                       \
-                    sel[new_size] = idx;                                                         \
-                    const auto& cell_value = reinterpret_cast<const T&>(data_array[idx]);        \
-                    bool ret = !null_bitmap[idx] && (_values.find(cell_value) OP _values.end()); \
-                    new_size += _opposite ? !ret : ret;                                          \
-                }                                                                                \
-            }                                                                                    \
-        } else if (column.is_column_dictionary()) {                                              \
-            if constexpr (std::is_same_v<T, StringValue>) {                                      \
-                auto& dict_col =                                                                 \
-                        reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(      \
-                                column);                                                         \
-                auto& data_array = dict_col.get_data();                                          \
-                std::vector<bool> selected;                                                      \
-                dict_col.find_codes(_values, selected);                                          \
-                for (uint16_t i = 0; i < *size; i++) {                                           \
-                    uint16_t idx = sel[i];                                                       \
-                    sel[new_size] = idx;                                                         \
-                    const auto& cell_value = data_array[idx];                                    \
-                    DCHECK(cell_value < (int64_t)selected.size());                               \
-                    auto result = (selected[cell_value] OP false);                               \
-                    new_size += _opposite ? !result : result;                                    \
-                }                                                                                \
-            }                                                                                    \
-        } else {                                                                                 \
-            auto& number_column = reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \
-            auto& data_array = number_column.get_data();                                         \
-            for (uint16_t i = 0; i < *size; i++) {                                               \
-                uint16_t idx = sel[i];                                                           \
-                sel[new_size] = idx;                                                             \
-                const auto& cell_value = reinterpret_cast<const T&>(data_array[idx]);            \
-                auto result = (_values.find(cell_value) OP _values.end());                       \
-                new_size += _opposite ? !result : result;                                        \
-            }                                                                                    \
-        }                                                                                        \
-        *size = new_size;                                                                        \
-    }
-
-IN_LIST_PRED_COLUMN_EVALUATE(InListPredicate, !=)
-IN_LIST_PRED_COLUMN_EVALUATE(NotInListPredicate, ==)
-
-#define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(CLASS, OP)                                       \
-    template <class T>                                                                         \
-    void CLASS<T>::evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags)  \
-            const {                                                                            \
-        if (block->is_nullable()) {                                                            \
-            for (uint16_t i = 0; i < size; ++i) {                                              \
-                if (flags[i]) continue;                                                        \
-                uint16_t idx = sel[i];                                                         \
-                const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \
-                auto result = (!block->cell(idx).is_null() && _values.find(*cell_value)        \
-                                                                      OP _values.end());       \
-                flags[i] |= _opposite ? !result : result;                                      \
-            }                                                                                  \
-        } else {                                                                               \
-            for (uint16_t i = 0; i < size; ++i) {                                              \
-                if (flags[i]) continue;                                                        \
-                uint16_t idx = sel[i];                                                         \
-                const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \
-                auto result = (_values.find(*cell_value) OP _values.end());                    \
-                flags[i] |= _opposite ? !result : result;                                      \
-            }                                                                                  \
-        }                                                                                      \
-    }
-
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(InListPredicate, !=)
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(NotInListPredicate, ==)
-
-#define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_AND(CLASS, OP)                                      \
-    template <class T>                                                                         \
-    void CLASS<T>::evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \
-            const {                                                                            \
-        if (block->is_nullable()) {                                                            \
-            for (uint16_t i = 0; i < size; ++i) {                                              \
-                if (!flags[i]) continue;                                                       \
-                uint16_t idx = sel[i];                                                         \
-                const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \
-                auto result = (!block->cell(idx).is_null() && _values.find(*cell_value)        \
-                                                                      OP _values.end());       \
-                flags[i] &= _opposite ? !result : result;                                      \
-            }                                                                                  \
-        } else {                                                                               \
-            for (uint16_t i = 0; i < size; ++i) {                                              \
-                if (!flags[i]) continue;                                                       \
-                uint16_t idx = sel[i];                                                         \
-                const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \
-                auto result = (_values.find(*cell_value) OP _values.end());                    \
-                flags[i] &= _opposite ? !result : result;                                      \
-            }                                                                                  \
-        }                                                                                      \
-    }
-
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_AND(InListPredicate, !=)
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_AND(NotInListPredicate, ==)
-
-#define IN_LIST_PRED_BITMAP_EVALUATE(CLASS, OP)                                     \
-    template <class T>                                                              \
-    Status CLASS<T>::evaluate(const Schema& schema,                                 \
-                              const std::vector<BitmapIndexIterator*>& iterators,   \
-                              uint32_t num_rows, roaring::Roaring* result) const {  \
-        BitmapIndexIterator* iterator = iterators[_column_id];                      \
-        if (iterator == nullptr) {                                                  \
-            return Status::OK();                                                    \
-        }                                                                           \
-        if (iterator->has_null_bitmap()) {                                          \
-            roaring::Roaring null_bitmap;                                           \
-            RETURN_IF_ERROR(iterator->read_null_bitmap(&null_bitmap));              \
-            *result -= null_bitmap;                                                 \
-        }                                                                           \
-        roaring::Roaring indices;                                                   \
-        for (auto value : _values) {                                                \
-            bool exact_match;                                                       \
-            Status s = iterator->seek_dictionary(&value, &exact_match);             \
-            rowid_t seeked_ordinal = iterator->current_ordinal();                   \
-            if (!s.is_not_found()) {                                                \
-                if (!s.ok()) {                                                      \
-                    return s;                                                       \
-                }                                                                   \
-                if (exact_match) {                                                  \
-                    roaring::Roaring index;                                         \
-                    RETURN_IF_ERROR(iterator->read_bitmap(seeked_ordinal, &index)); \
-                    indices |= index;                                               \
-                }                                                                   \
-            }                                                                       \
-        }                                                                           \
-        *result OP indices;                                                         \
-        return Status::OK();                                                        \
-    }
-
-IN_LIST_PRED_BITMAP_EVALUATE(InListPredicate, &=)
-IN_LIST_PRED_BITMAP_EVALUATE(NotInListPredicate, -=)
-
-#define IN_LIST_PRED_CONSTRUCTOR_DECLARATION(CLASS)                                                \
-    template CLASS<int8_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<int8_t>&& values,       \
-                                  bool opposite);                                                  \
-    template CLASS<int16_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<int16_t>&& values,     \
-                                   bool opposite);                                                 \
-    template CLASS<int32_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<int32_t>&& values,     \
-                                   bool opposite);                                                 \
-    template CLASS<int64_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<int64_t>&& values,     \
-                                   bool opposite);                                                 \
-    template CLASS<int128_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<int128_t>&& values,   \
-                                    bool opposite);                                                \
-    template CLASS<float>::CLASS(uint32_t column_id, phmap::flat_hash_set<float>&& values,         \
-                                 bool opposite);                                                   \
-    template CLASS<double>::CLASS(uint32_t column_id, phmap::flat_hash_set<double>&& values,       \
-                                  bool opposite);                                                  \
-    template CLASS<decimal12_t>::CLASS(uint32_t column_id,                                         \
-                                       phmap::flat_hash_set<decimal12_t>&& values, bool opposite); \
-    template CLASS<StringValue>::CLASS(uint32_t column_id,                                         \
-                                       phmap::flat_hash_set<StringValue>&& values, bool opposite); \
-    template CLASS<uint24_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<uint24_t>&& values,   \
-                                    bool opposite);                                                \
-    template CLASS<uint64_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<uint64_t>&& values,   \
-                                    bool opposite);
-
-IN_LIST_PRED_CONSTRUCTOR_DECLARATION(InListPredicate)
-IN_LIST_PRED_CONSTRUCTOR_DECLARATION(NotInListPredicate)
-
-#define IN_LIST_PRED_EVALUATE_DECLARATION(CLASS)                                 \
-    template void CLASS<int8_t>::evaluate(VectorizedRowBatch* batch) const;      \
-    template void CLASS<int16_t>::evaluate(VectorizedRowBatch* batch) const;     \
-    template void CLASS<int32_t>::evaluate(VectorizedRowBatch* batch) const;     \
-    template void CLASS<int64_t>::evaluate(VectorizedRowBatch* batch) const;     \
-    template void CLASS<int128_t>::evaluate(VectorizedRowBatch* batch) const;    \
-    template void CLASS<float>::evaluate(VectorizedRowBatch* batch) const;       \
-    template void CLASS<double>::evaluate(VectorizedRowBatch* batch) const;      \
-    template void CLASS<decimal12_t>::evaluate(VectorizedRowBatch* batch) const; \
-    template void CLASS<StringValue>::evaluate(VectorizedRowBatch* batch) const; \
-    template void CLASS<uint24_t>::evaluate(VectorizedRowBatch* batch) const;    \
-    template void CLASS<uint64_t>::evaluate(VectorizedRowBatch* batch) const;
-
-IN_LIST_PRED_EVALUATE_DECLARATION(InListPredicate)
-IN_LIST_PRED_EVALUATE_DECLARATION(NotInListPredicate)
-
-#define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_DECLARATION(CLASS)                                      \
-    template void CLASS<int8_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)       \
-            const;                                                                                 \
-    template void CLASS<int16_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)      \
-            const;                                                                                 \
-    template void CLASS<int32_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)      \
-            const;                                                                                 \
-    template void CLASS<int64_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)      \
-            const;                                                                                 \
-    template void CLASS<int128_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)     \
-            const;                                                                                 \
-    template void CLASS<float>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const; \
-    template void CLASS<double>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)       \
-            const;                                                                                 \
-    template void CLASS<decimal12_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)  \
-            const;                                                                                 \
-    template void CLASS<StringValue>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)  \
-            const;                                                                                 \
-    template void CLASS<uint24_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)     \
-            const;                                                                                 \
-    template void CLASS<uint64_t>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size)     \
-            const;
-
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_DECLARATION(InListPredicate)
-IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_DECLARATION(NotInListPredicate)
-
-#define IN_LIST_PRED_BITMAP_EVALUATE_DECLARATION(CLASS)                                           \
-    template Status CLASS<int8_t>::evaluate(const Schema& schema,                                 \
-                                            const std::vector<BitmapIndexIterator*>& iterators,   \
-                                            uint32_t num_rows, roaring::Roaring* bitmap) const;   \
-    template Status CLASS<int16_t>::evaluate(const Schema& schema,                                \
-                                             const std::vector<BitmapIndexIterator*>& iterators,  \
-                                             uint32_t num_rows, roaring::Roaring* bitmap) const;  \
-    template Status CLASS<int32_t>::evaluate(const Schema& schema,                                \
-                                             const std::vector<BitmapIndexIterator*>& iterators,  \
-                                             uint32_t num_rows, roaring::Roaring* bitmap) const;  \
-    template Status CLASS<int64_t>::evaluate(const Schema& schema,                                \
-                                             const std::vector<BitmapIndexIterator*>& iterators,  \
-                                             uint32_t num_rows, roaring::Roaring* bitmap) const;  \
-    template Status CLASS<int128_t>::evaluate(const Schema& schema,                               \
-                                              const std::vector<BitmapIndexIterator*>& iterators, \
-                                              uint32_t num_rows, roaring::Roaring* bitmap) const; \
-    template Status CLASS<float>::evaluate(const Schema& schema,                                  \
-                                           const std::vector<BitmapIndexIterator*>& iterators,    \
-                                           uint32_t num_rows, roaring::Roaring* bitmap) const;    \
-    template Status CLASS<double>::evaluate(const Schema& schema,                                 \
-                                            const std::vector<BitmapIndexIterator*>& iterators,   \
-                                            uint32_t num_rows, roaring::Roaring* bitmap) const;   \
-    template Status CLASS<decimal12_t>::evaluate(                                                 \
-            const Schema& schema, const std::vector<BitmapIndexIterator*>& iterators,             \
-            uint32_t num_rows, roaring::Roaring* bitmap) const;                                   \
-    template Status CLASS<StringValue>::evaluate(                                                 \
-            const Schema& schema, const std::vector<BitmapIndexIterator*>& iterators,             \
-            uint32_t num_rows, roaring::Roaring* bitmap) const;                                   \
-    template Status CLASS<uint24_t>::evaluate(const Schema& schema,                               \
-                                              const std::vector<BitmapIndexIterator*>& iterators, \
-                                              uint32_t num_rows, roaring::Roaring* bitmap) const; \
-    template Status CLASS<uint64_t>::evaluate(const Schema& schema,                               \
-                                              const std::vector<BitmapIndexIterator*>& iterators, \
-                                              uint32_t num_rows, roaring::Roaring* bitmap) const;
-
-IN_LIST_PRED_BITMAP_EVALUATE_DECLARATION(InListPredicate)
-IN_LIST_PRED_BITMAP_EVALUATE_DECLARATION(NotInListPredicate)
-
-} //namespace doris
\ No newline at end of file
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index cf2f4b2cdc..f41a7f51c5 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -21,11 +21,15 @@
 #include <stdint.h>
 
 #include <roaring/roaring.hh>
+#include <type_traits>
 
 #include "decimal12.h"
 #include "olap/column_predicate.h"
 #include "runtime/string_value.h"
+#include "runtime/vectorized_row_batch.h"
 #include "uint24.h"
+#include "vec/columns/column_dictionary.h"
+#include "vec/core/types.h"
 
 namespace std {
 // for string value
@@ -75,34 +79,289 @@ namespace doris {
 
 class VectorizedRowBatch;
 
-// todo(wb) support evaluate_and,evaluate_or
-
-#define IN_LIST_PRED_CLASS_DEFINE(CLASS, PT)                                                      \
-    template <class T>                                                                            \
-    class CLASS : public ColumnPredicate {                                                        \
-    public:                                                                                       \
-        CLASS(uint32_t column_id, phmap::flat_hash_set<T>&& values, bool is_opposite = false);    \
-        PredicateType type() const override { return PredicateType::PT; }                         \
-        virtual void evaluate(VectorizedRowBatch* batch) const override;                          \
-        void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override;          \
-        void evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size,                        \
-                         bool* flags) const override;                                             \
-        void evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size,                       \
-                          bool* flags) const override;                                            \
-        virtual Status evaluate(const Schema& schema,                                             \
-                                const std::vector<BitmapIndexIterator*>& iterators,               \
-                                uint32_t num_rows, roaring::Roaring* bitmap) const override;      \
-        void evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const override; \
-        void evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t size,              \
-                          bool* flags) const override {}                                          \
-        void evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size,               \
-                         bool* flags) const override {}                                           \
-                                                                                                  \
-    private:                                                                                      \
-        phmap::flat_hash_set<T> _values;                                                          \
+template <class T, PredicateType PT>
+class InListPredicateBase : public ColumnPredicate {
+public:
+    InListPredicateBase(uint32_t column_id, phmap::flat_hash_set<T>&& values,
+                        bool is_opposite = false) // column_id and is_opposite are forwarded to ColumnPredicate
+            : ColumnPredicate(column_id, is_opposite), _values(std::move(values)) {} // values is moved into _values
+
+    PredicateType type() const override { return PT; } // PT is this class's PredicateType template argument
+
+    void evaluate(VectorizedRowBatch* batch) const override { // keeps only matching rows in batch's selection
+        uint16_t n = batch->size();
+        if (!n) {
+            return;
+        }
+
+        uint16_t* sel = batch->selected();
+        const T* col_vector = reinterpret_cast<const T*>(batch->column(_column_id)->col_data());
+        uint16_t new_size = 0;
+        if (batch->column(_column_id)->no_nulls()) {
+            if (batch->selected_in_use()) {
+                for (uint16_t j = 0; j != n; ++j) {
+                    uint16_t i = sel[j];
+                    sel[new_size] = i; // speculative write; the slot is kept only if new_size advances below
+                    new_size += _operator(_values.find(col_vector[i]), _values.end());
+                }
+                batch->set_size(new_size);
+            } else {
+                for (uint16_t i = 0; i != n; ++i) {
+                    sel[new_size] = i;
+                    new_size += _operator(_values.find(col_vector[i]), _values.end());
+                }
+                if (new_size < n) { // at least one row was dropped: mark the selection vector as in use
+                    batch->set_size(new_size);
+                    batch->set_selected_in_use(true);
+                }
+            }
+        } else {
+            bool* is_null = batch->column(_column_id)->is_null();
+            if (batch->selected_in_use()) {
+                for (uint16_t j = 0; j != n; ++j) {
+                    uint16_t i = sel[j];
+                    sel[new_size] = i;
+                    new_size += // a row flagged in is_null never advances new_size
+                            (!is_null[i] && _operator(_values.find(col_vector[i]), _values.end()));
+                }
+                batch->set_size(new_size);
+            } else {
+                for (uint16_t i = 0; i != n; ++i) { // uint16_t like n and sel[]; previously a signed int
+                    sel[new_size] = i;
+                    new_size +=
+                            (!is_null[i] && _operator(_values.find(col_vector[i]), _values.end()));
+                }
+                if (new_size < n) {
+                    batch->set_size(new_size);
+                    batch->set_selected_in_use(true);
+                }
+            }
+        }
     };
 
-IN_LIST_PRED_CLASS_DEFINE(InListPredicate, IN_LIST)
-IN_LIST_PRED_CLASS_DEFINE(NotInListPredicate, NOT_IN_LIST)
+    void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override { // filters sel in place, rewrites *size
+        if (block->is_nullable()) {
+            _base_evaluate<true>(block, sel, size); // template argument is is_nullable
+        } else {
+            _base_evaluate<false>(block, sel, size);
+        }
+    }
+
+    void evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) const override {
+        if (block->is_nullable()) { // dispatch on nullability; the flag helper runs with is_and = false
+            _base_evaluate<true, false>(block, sel, size, flags); // template arguments are <is_nullable, is_and>
+        } else {
+            _base_evaluate<false, false>(block, sel, size, flags);
+        }
+    }
+
+    void evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size,
+                      bool* flags) const override { // dispatch on nullability; the flag helper runs with is_and = true
+        if (block->is_nullable()) {
+            _base_evaluate<true, true>(block, sel, size, flags); // template arguments are <is_nullable, is_and>
+        } else {
+            _base_evaluate<false, true>(block, sel, size, flags);
+        }
+    }
+
+    Status evaluate(const Schema& schema, const std::vector<BitmapIndexIterator*>& iterators,
+                    uint32_t num_rows, roaring::Roaring* result) const override { // narrows *result via the bitmap index
+        BitmapIndexIterator* iterator = iterators[_column_id];
+        if (iterator == nullptr) { // no iterator for this column: *result is left unchanged
+            return Status::OK();
+        }
+        if (iterator->has_null_bitmap()) {
+            roaring::Roaring null_bitmap;
+            RETURN_IF_ERROR(iterator->read_null_bitmap(&null_bitmap));
+            *result -= null_bitmap; // done for IN_LIST and NOT_IN_LIST alike
+        }
+        roaring::Roaring indices; // union of the bitmaps read for every value that seeks to an exact match
+        for (auto value : _values) {
+            bool exact_match;
+            Status s = iterator->seek_dictionary(&value, &exact_match);
+            rowid_t seeked_ordinal = iterator->current_ordinal();
+            if (!s.is_not_found()) { // a not-found status is skipped rather than returned as an error
+                if (!s.ok()) {
+                    return s;
+                }
+                if (exact_match) {
+                    roaring::Roaring index;
+                    RETURN_IF_ERROR(iterator->read_bitmap(seeked_ordinal, &index));
+                    indices |= index;
+                }
+            }
+        }
+
+        if constexpr (PT == PredicateType::IN_LIST) {
+            *result &= indices; // IN_LIST: intersect with the collected bitmaps
+        } else {
+            *result -= indices; // otherwise: subtract the collected bitmaps
+        }
+
+        return Status::OK();
+    }
+
+    void evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const override {
+        if (column.is_nullable()) { // split into nested column + null map before filtering
+            auto* nullable_col =
+                    vectorized::check_and_get_column<vectorized::ColumnNullable>(column);
+            auto& null_bitmap = reinterpret_cast<const vectorized::ColumnUInt8&>(
+                                        nullable_col->get_null_map_column())
+                                        .get_data();
+            auto& nested_col = nullable_col->get_nested_column();
+
+            if (_opposite) { // the runtime _opposite flag becomes the is_opposite template argument
+                _base_evaluate<true, true>(&nested_col, &null_bitmap, sel, size);
+            } else {
+                _base_evaluate<true, false>(&nested_col, &null_bitmap, sel, size);
+            }
+        } else {
+            if (_opposite) {
+                _base_evaluate<false, true>(&column, nullptr, sel, size); // nullptr: no null map on this path
+            } else {
+                _base_evaluate<false, false>(&column, nullptr, sel, size);
+            }
+        }
+    }
+
+    // TODO(wb): implement evaluate_and / evaluate_or for vectorized::IColumn.
+    void evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t size,
+                      bool* flags) const override { // not implemented: the only statement is a LOG(FATAL)
+        LOG(FATAL) << "IColumn not support in_list_predicate.evaluate_and now.";
+    }
+    void evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size,
+                     bool* flags) const override { // not implemented: the only statement is a LOG(FATAL)
+        LOG(FATAL) << "IColumn not support in_list_predicate.evaluate_or now.";
+    }
+
+private:
+    template <typename LeftT, typename RightT> // callers pass (_values.find(v), _values.end())
+    bool _operator(const LeftT& lhs, const RightT& rhs) const {
+        if constexpr (PT == PredicateType::IN_LIST) {
+            return lhs != rhs; // IN_LIST: true when find() did not return end(), i.e. v is in the set
+        }
+        return lhs == rhs; // any other PT: true when find() returned end(), i.e. v is not in the set
+    }
+
+    template <bool is_nullable> // is_nullable: whether each cell's is_null() is consulted
+    void _base_evaluate(const ColumnBlock* block, uint16_t* sel, uint16_t* size) const {
+        uint16_t new_size = 0;
+        for (uint16_t i = 0; i < *size; ++i) {
+            uint16_t idx = sel[i];
+            sel[new_size] = idx; // speculative write; the slot is kept only if new_size advances below
+            const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr());
+            if constexpr (is_nullable) {
+                new_size += _opposite ^ (!block->cell(idx).is_null() &&
+                                         _operator(_values.find(*cell_value), _values.end()));
+            } else {
+                new_size += _opposite ^ _operator(_values.find(*cell_value), _values.end()); // _opposite flips the match
+            }
+        }
+        *size = new_size; // count of row ids kept at the front of sel
+    }
+
+    template <bool is_nullable, bool is_and> // is_and: true => AND the result into flags, false => OR it in
+    void _base_evaluate(const ColumnBlock* block, const uint16_t* sel, uint16_t size,
+                        bool* flags) const {
+        for (uint16_t i = 0; i < size; ++i) {
+            if (flags[i] != is_and) { // already decided: false stays false under AND, true stays true under OR
+                continue;
+            }
+            // flags[i] belongs to block row sel[i]; only still-undecided rows reach this point.
+            uint16_t idx = sel[i];
+            const T* cell_value = reinterpret_cast<const T*>(block->cell(idx).cell_ptr());
+            auto result = true;
+            if constexpr (is_nullable) {
+                result &= !block->cell(idx).is_null(); // a null cell never matches before _opposite is applied
+            }
+            result &= _operator(_values.find(*cell_value), _values.end());
+
+            if constexpr (is_and) {
+                flags[i] &= _opposite ^ result;
+            } else {
+                flags[i] |= _opposite ^ result; // previously unreachable for false flags, so OR never set anything
+            }
+        }
+    }
+
+    template <bool is_nullable, bool is_opposite> // is_opposite mirrors the runtime _opposite flag
+    void _base_evaluate(const vectorized::IColumn* column,
+                        const vectorized::PaddedPODArray<vectorized::UInt8>* null_map,
+                        uint16_t* sel, uint16_t* size) const {
+        uint16_t new_size = 0;
+        // Compacts sel in place to the surviving row ids; null_map is only read when is_nullable.
+        if (column->is_column_dictionary()) {
+            if constexpr (std::is_same_v<T, StringValue>) {
+                auto* nested_col_ptr = vectorized::check_and_get_column<
+                        vectorized::ColumnDictionary<vectorized::Int32>>(column);
+                auto& data_array = nested_col_ptr->get_data();
+                std::vector<vectorized::UInt8> selected;
+                nested_col_ptr->find_codes(_values, selected); // selected[code] is set for codes whose word is in _values
+
+                for (uint16_t i = 0; i < *size; i++) {
+                    uint16_t idx = sel[i];
+                    if constexpr (is_nullable) {
+                        if ((*null_map)[idx]) {
+                            if constexpr (is_opposite) { // a null row is kept only by the opposite form
+                                sel[new_size++] = idx;
+                            }
+                            continue;
+                        }
+                    }
+
+                    if constexpr (is_opposite != (PT == PredicateType::IN_LIST)) { // selected[] is raw membership, so PT is applied here
+                        if (selected[data_array[idx]]) {
+                            sel[new_size++] = idx;
+                        }
+                    } else {
+                        if (!selected[data_array[idx]]) {
+                            sel[new_size++] = idx;
+                        }
+                    }
+                }
+            } else {
+                LOG(FATAL) << "column_dictionary must use StringValue predicate.";
+            }
+        } else {
+            auto* nested_col_ptr =
+                    vectorized::check_and_get_column<vectorized::PredicateColumnType<T>>(column);
+            auto& data_array = nested_col_ptr->get_data();
+
+            for (uint16_t i = 0; i < *size; i++) {
+                uint16_t idx = sel[i];
+                if constexpr (is_nullable) {
+                    if ((*null_map)[idx]) {
+                        if constexpr (is_opposite) {
+                            sel[new_size++] = idx;
+                        }
+                        continue;
+                    }
+                }
+
+                if constexpr (!is_opposite) { // _operator already encodes PT; testing PT again here inverted NOT_IN_LIST
+                    if (_operator(_values.find(reinterpret_cast<const T&>(data_array[idx])),
+                                  _values.end())) {
+                        sel[new_size++] = idx;
+                    }
+                } else {
+                    if (!_operator(_values.find(reinterpret_cast<const T&>(data_array[idx])),
+                                   _values.end())) {
+                        sel[new_size++] = idx;
+                    }
+                }
+            }
+        }
+
+        *size = new_size;
+    }
+
+    phmap::flat_hash_set<T> _values; // the value list every evaluate overload probes with find()
+};
+
+template <class T>
+using InListPredicate = InListPredicateBase<T, PredicateType::IN_LIST>; // same name the removed macro used to define
+
+template <class T>
+using NotInListPredicate = InListPredicateBase<T, PredicateType::NOT_IN_LIST>; // same name the removed macro used to define
 
 } //namespace doris
diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h
index 29db3a334c..d38bc4e049 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -21,21 +21,10 @@
 
 #include <algorithm>
 
-#include "gutil/hash/string_hash.h"
-#include "olap/column_predicate.h"
-#include "olap/comparison_predicate.h"
-#include "olap/decimal12.h"
-#include "olap/in_list_predicate.h"
-#include "olap/uint24.h"
 #include "runtime/string_value.h"
-#include "util/slice.h"
 #include "vec/columns/column.h"
-#include "vec/columns/column_decimal.h"
-#include "vec/columns/column_impl.h"
 #include "vec/columns/column_string.h"
-#include "vec/columns/column_vector.h"
 #include "vec/columns/predicate_column.h"
-#include "vec/common/typeid_cast.h"
 #include "vec/core/types.h"
 
 namespace doris::vectorized {
@@ -259,7 +248,7 @@ public:
     uint32_t get_hash_value(uint32_t idx) const { return _dict.get_hash_value(_codes[idx]); }
 
     void find_codes(const phmap::flat_hash_set<StringValue>& values,
-                    std::vector<bool>& selected) const {
+                    std::vector<vectorized::UInt8>& selected) const { // UInt8 elements replace std::vector<bool>; forwards to _dict
         return _dict.find_codes(values, selected);
     }
 
@@ -363,13 +352,12 @@ public:
         }
 
         void find_codes(const phmap::flat_hash_set<StringValue>& values,
-                        std::vector<bool>& selected) const {
+                        std::vector<vectorized::UInt8>& selected) const {
             size_t dict_word_num = _dict_data.size();
             selected.resize(dict_word_num);
             selected.assign(dict_word_num, false);
             for (const auto& value : values) {
-                auto it = _inverted_index.find(value);
-                if (it != _inverted_index.end()) {
+                if (auto it = _inverted_index.find(value); it != _inverted_index.end()) {
                     selected[it->second] = true;
                 }
             }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org