You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/12/01 07:02:28 UTC

[doris] 02/02: [Enhancement](bitmapfilter) Support bitmap filter to apply zone_map index to filter pages (#14635)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-unstable
in repository https://gitbox.apache.org/repos/asf/doris.git

commit d85c1e98b472bd915f68c66e9e407b4c17e6d3ee
Author: luozenglin <37...@users.noreply.github.com>
AuthorDate: Thu Dec 1 10:41:09 2022 +0800

    [Enhancement](bitmapfilter) Support bitmap filter to apply zone_map index to filter pages (#14635)
---
 be/src/exprs/bitmapfilter_predicate.h              | 35 ++++++----
 be/src/exprs/runtime_filter.cpp                    | 21 ++++--
 be/src/exprs/runtime_filter.h                      |  3 +
 be/src/olap/bitmap_filter_predicate.h              | 45 +++++++------
 be/src/olap/olap_common.h                          |  2 +
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  1 +
 be/src/util/bitmap_value.h                         | 76 +++++++++++++++++++++-
 be/src/vec/exec/scan/new_olap_scan_node.cpp        |  1 +
 be/src/vec/exec/scan/new_olap_scan_node.h          |  1 +
 be/src/vec/exec/scan/new_olap_scanner.cpp          |  2 +
 be/src/vec/runtime/shared_hash_table_controller.h  |  2 +-
 11 files changed, 151 insertions(+), 38 deletions(-)

diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h
index c9a2efc621..dbc9447034 100644
--- a/be/src/exprs/bitmapfilter_predicate.h
+++ b/be/src/exprs/bitmapfilter_predicate.h
@@ -17,7 +17,8 @@
 
 #pragma once
 
-#include "common/object_pool.h"
+#include <algorithm>
+
 #include "gutil/integral_types.h"
 #include "runtime/define_primitive_type.h"
 #include "runtime/primitive_type.h"
@@ -30,14 +31,15 @@ class BitmapFilterFuncBase {
 public:
     virtual void insert(const void* data) = 0;
     virtual void insert_many(const std::vector<const BitmapValue*> bitmaps) = 0;
-    virtual bool find(uint64 data) const = 0;
-    virtual bool is_empty() = 0;
+    virtual bool empty() = 0;
     virtual Status assign(BitmapValue* bitmap_value) = 0;
     virtual void light_copy(BitmapFilterFuncBase* other) { _not_in = other->_not_in; };
     virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap,
                                                 uint16_t* offsets, int number) = 0;
     virtual void find_batch(const char* data, const uint8* nullmap, int number,
                             uint8* results) const = 0;
+    virtual size_t size() const = 0;
+    bool is_not_in() const { return _not_in; }
     void set_not_in(bool not_in) { _not_in = not_in; }
     virtual ~BitmapFilterFuncBase() = default;
 
@@ -51,7 +53,7 @@ class BitmapFilterFunc : public BitmapFilterFuncBase {
 public:
     using CppType = typename PrimitiveTypeTraits<type>::CppType;
 
-    BitmapFilterFunc() : _bitmap_value(std::make_shared<BitmapValue>()), _empty(true) {}
+    BitmapFilterFunc() : _bitmap_value(std::make_shared<BitmapValue>()) {}
 
     ~BitmapFilterFunc() override = default;
 
@@ -59,15 +61,13 @@ public:
 
     void insert_many(const std::vector<const BitmapValue*> bitmaps) override;
 
-    bool find(uint64 data) const override { return _not_in ^ _bitmap_value->contains(data); }
-
     uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets,
                                         int number) override;
 
     void find_batch(const char* data, const uint8* nullmap, int number,
                     uint8* results) const override;
 
-    bool is_empty() override { return _empty; }
+    bool empty() override { return _bitmap_value->empty(); }
 
     Status assign(BitmapValue* bitmap_value) override {
         *_bitmap_value = *bitmap_value;
@@ -76,9 +76,25 @@ public:
 
     void light_copy(BitmapFilterFuncBase* bloomfilter_func) override;
 
+    size_t size() const override { return _bitmap_value->cardinality(); }
+
+    uint64_t max() { return _bitmap_value->max(nullptr); }
+
+    uint64_t min() { return _bitmap_value->min(nullptr); }
+
+    bool contains_any(CppType left, CppType right) {
+        if (right < 0) {
+            return false;
+        }
+        return _bitmap_value->contains_any(std::max(left, (CppType)0), right);
+    }
+
+    std::shared_ptr<BitmapValue> get_inner_bitmap() { return _bitmap_value; }
+
 private:
     std::shared_ptr<BitmapValue> _bitmap_value;
-    bool _empty;
+
+    bool find(CppType data) const { return _not_in ^ (data >= 0 && _bitmap_value->contains(data)); }
 };
 
 template <PrimitiveType type>
@@ -88,7 +104,6 @@ void BitmapFilterFunc<type>::insert(const void* data) {
     }
 
     *_bitmap_value |= *reinterpret_cast<const BitmapValue*>(data);
-    _empty = false;
 }
 
 template <PrimitiveType type>
@@ -97,7 +112,6 @@ void BitmapFilterFunc<type>::insert_many(const std::vector<const BitmapValue*> b
         return;
     }
     _bitmap_value->fastunion(bitmaps);
-    _empty = false;
 }
 
 template <PrimitiveType type>
@@ -136,7 +150,6 @@ template <PrimitiveType type>
 void BitmapFilterFunc<type>::light_copy(BitmapFilterFuncBase* bitmapfilter_func) {
     BitmapFilterFuncBase::light_copy(bitmapfilter_func);
     auto other_func = reinterpret_cast<BitmapFilterFunc*>(bitmapfilter_func);
-    _empty = other_func->_empty;
     _bitmap_value = other_func->_bitmap_value;
 }
 
diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp
index 39760c7c3d..d584f06d17 100644
--- a/be/src/exprs/runtime_filter.cpp
+++ b/be/src/exprs/runtime_filter.cpp
@@ -17,6 +17,8 @@
 
 #include "runtime_filter.h"
 
+#include <memory>
+
 #include "common/object_pool.h"
 #include "common/status.h"
 #include "exprs/binary_predicate.h"
@@ -457,8 +459,8 @@ public:
             return Status::OK();
         }
         case RuntimeFilterType::BITMAP_FILTER: {
-            _context._bitmap_filter_func.reset(create_bitmap_filter(_column_return_type));
-            _context._bitmap_filter_func->set_not_in(params->bitmap_filter_not_in);
+            _context.bitmap_filter_func.reset(create_bitmap_filter(_column_return_type));
+            _context.bitmap_filter_func->set_not_in(params->bitmap_filter_not_in);
             return Status::OK();
         }
         default:
@@ -524,7 +526,7 @@ public:
             break;
         }
         case RuntimeFilterType::BITMAP_FILTER: {
-            _context._bitmap_filter_func->insert(data);
+            _context.bitmap_filter_func->insert(data);
             break;
         }
         default:
@@ -602,7 +604,7 @@ public:
         for (int index : rows) {
             bitmaps.push_back(&(col->get_data()[index]));
         }
-        _context._bitmap_filter_func->insert_many(bitmaps);
+        _context.bitmap_filter_func->insert_many(bitmaps);
     }
 
     RuntimeFilterType get_real_type() {
@@ -1086,6 +1088,10 @@ public:
 
     size_t get_in_filter_size() const { return _context.hybrid_set->size(); }
 
+    std::shared_ptr<BitmapFilterFuncBase> get_bitmap_filter() const {
+        return _context.bitmap_filter_func;
+    }
+
     friend class IRuntimeFilter;
 
 private:
@@ -1260,6 +1266,11 @@ void IRuntimeFilter::signal() {
     if (_wrapper->get_real_type() == RuntimeFilterType::IN_FILTER) {
         _profile->add_info_string("InFilterSize", std::to_string(_wrapper->get_in_filter_size()));
     }
+    if (_wrapper->get_real_type() == RuntimeFilterType::BITMAP_FILTER) {
+        auto bitmap_filter = _wrapper->get_bitmap_filter();
+        _profile->add_info_string("BitmapSize", std::to_string(bitmap_filter->size()));
+        _profile->add_info_string("IsNotIn", bitmap_filter->is_not_in() ? "true" : "false");
+    }
 }
 
 BloomFilterFuncBase* IRuntimeFilter::get_bloomfilter() const {
@@ -1935,7 +1946,7 @@ Status RuntimePredicateWrapper::get_push_vexprs(std::vector<doris::vectorized::V
         node.__set_vector_opcode(to_in_opcode(_column_return_type));
         node.__set_is_nullable(false);
         auto bitmap_pred = _pool->add(new vectorized::VBitmapPredicate(node));
-        bitmap_pred->set_filter(_context._bitmap_filter_func);
+        bitmap_pred->set_filter(_context.bitmap_filter_func);
         auto cloned_vexpr = vprob_expr->root()->clone(_pool);
         bitmap_pred->add_child(cloned_vexpr);
         auto wrapper = _pool->add(new vectorized::VRuntimeFilterWrapper(node, bitmap_pred));
diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h
index 09afe6c640..5cd08aca46 100644
--- a/be/src/exprs/runtime_filter.h
+++ b/be/src/exprs/runtime_filter.h
@@ -43,6 +43,7 @@ class PMinMaxFilter;
 class HashJoinNode;
 class RuntimeProfile;
 class BloomFilterFuncBase;
+class BitmapFilterFuncBase;
 
 namespace vectorized {
 class VExpr;
@@ -252,6 +253,8 @@ public:
 
     void ready_for_publish();
 
+    std::shared_ptr<BitmapFilterFuncBase> get_bitmap_filter() const;
+
     static bool enable_use_batch(int be_exec_version, PrimitiveType type) {
         return be_exec_version > 0 && (is_int_or_bool(type) || is_float_or_double(type));
     }
diff --git a/be/src/olap/bitmap_filter_predicate.h b/be/src/olap/bitmap_filter_predicate.h
index be3dc23cc9..053cdb748d 100644
--- a/be/src/olap/bitmap_filter_predicate.h
+++ b/be/src/olap/bitmap_filter_predicate.h
@@ -17,9 +17,14 @@
 
 #pragma once
 
+#include <cstdint>
+
 #include "exprs/bitmapfilter_predicate.h"
+#include "exprs/cast_functions.h"
 #include "exprs/runtime_filter.h"
 #include "olap/column_predicate.h"
+#include "olap/wrapper_field.h"
+#include "util/bitmap_value.h"
 #include "vec/columns/column_dictionary.h"
 #include "vec/columns/column_nullable.h"
 #include "vec/columns/column_vector.h"
@@ -51,6 +56,26 @@ public:
     void evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size,
                       bool* flags) const override {};
 
+    bool evaluate_and(const std::pair<WrapperField*, WrapperField*>& statistic) const override {
+        if (_specific_filter->is_not_in()) {
+            return true;
+        }
+
+        CppType max_value;
+        if (statistic.second->is_null()) {
+            // no non-null values
+            return false;
+        } else {
+            max_value = *reinterpret_cast<const CppType*>(statistic.second->cell_ptr());
+        }
+
+        CppType min_value =
+                statistic.first->is_null() /* contains null values */
+                        ? 0
+                        : *reinterpret_cast<const CppType*>(statistic.first->cell_ptr());
+        return _specific_filter->contains_any(min_value, max_value);
+    }
+
     Status evaluate(BitmapIndexIterator*, uint32_t, roaring::Roaring*) const override {
         return Status::OK();
     }
@@ -84,25 +109,7 @@ private:
 };
 
 template <PrimitiveType T>
-void BitmapFilterColumnPredicate<T>::evaluate(ColumnBlock* block, uint16_t* sel,
-                                              uint16_t* size) const {
-    uint16_t new_size = 0;
-    if (block->is_nullable()) {
-        for (uint16_t i = 0; i < *size; ++i) {
-            uint16_t idx = sel[i];
-            sel[new_size] = idx;
-            new_size += (!block->cell(idx).is_null() &&
-                         _specific_filter->find(*block->cell(idx).cell_ptr()));
-        }
-    } else {
-        for (uint16_t i = 0; i < *size; ++i) {
-            uint16_t idx = sel[i];
-            sel[new_size] = idx;
-            new_size += _specific_filter->find(*block->cell(idx).cell_ptr());
-        }
-    }
-    *size = new_size;
-}
+void BitmapFilterColumnPredicate<T>::evaluate(ColumnBlock*, uint16_t*, uint16_t*) const {}
 
 template <PrimitiveType T>
 uint16_t BitmapFilterColumnPredicate<T>::evaluate(const vectorized::IColumn& column, uint16_t* sel,
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index 2ca2c37982..41f9c2132a 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -286,6 +286,7 @@ struct OlapReaderStatistics {
     // block_load_ns
     //      block_init_ns
     //          block_init_seek_ns
+    //          block_conditions_filtered_ns
     //      first_read_ns
     //          block_first_read_seek_ns
     //      lazy_read_ns
@@ -322,6 +323,7 @@ struct OlapReaderStatistics {
     int64_t rows_del_by_bitmap = 0;
     // the number of rows filtered by various column indexes.
     int64_t rows_conditions_filtered = 0;
+    int64_t block_conditions_filtered_ns = 0;
 
     int64_t index_load_ns = 0;
 
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 1caa62f638..5f5419073a 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -282,6 +282,7 @@ Status SegmentIterator::_prepare_seek(const StorageReadOptions::KeyRange& key_ra
 }
 
 Status SegmentIterator::_get_row_ranges_by_column_conditions() {
+    SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_ns);
     if (_row_bitmap.isEmpty()) {
         return Status::OK();
     }
diff --git a/be/src/util/bitmap_value.h b/be/src/util/bitmap_value.h
index e8f10a4f39..2717f18895 100644
--- a/be/src/util/bitmap_value.h
+++ b/be/src/util/bitmap_value.h
@@ -21,6 +21,7 @@
 
 #include <algorithm>
 #include <cstdarg>
+#include <cstdint>
 #include <cstdio>
 #include <limits>
 #include <map>
@@ -32,6 +33,7 @@
 #include <utility>
 
 #include "common/logging.h"
+#include "gutil/integral_types.h"
 #include "udf/udf.h"
 #include "util/coding.h"
 #include "vec/common/pod_array.h"
@@ -1430,6 +1432,9 @@ public:
         return false;
     }
 
+    // true if contains a value that belongs to the range [left, right].
+    bool contains_any(uint64_t left, uint64_t right) const;
+
     uint64_t cardinality() const {
         switch (_type) {
         case EMPTY:
@@ -1677,6 +1682,16 @@ public:
         }
     }
 
+    uint64_t max(bool* empty) const {
+        return min_or_max(empty, [&]() { return _bitmap.maximum(); });
+    }
+
+    uint64_t min(bool* empty) const {
+        return min_or_max(empty, [&]() { return _bitmap.minimum(); });
+    }
+
+    bool empty() const { return _type == EMPTY; }
+
     /**
      * Return new set with specified range (not include the range_end)
      */
@@ -1823,6 +1838,7 @@ public:
 
     b_iterator begin() const;
     b_iterator end() const;
+    b_iterator lower_bound(uint64_t val) const;
 
 private:
     void _convert_to_smaller_type() {
@@ -1839,6 +1855,25 @@ private:
         }
     }
 
+    uint64_t min_or_max(bool* empty, std::function<uint64_t()> func) const {
+        bool is_empty = false;
+        uint64_t result = 0;
+        switch (_type) {
+        case SINGLE:
+            result = _sv;
+            break;
+        case BITMAP:
+            result = func();
+            break;
+        default:
+            is_empty = true;
+        }
+        if (empty) {
+            *empty = is_empty;
+        }
+        return result;
+    }
+
     enum BitmapDataType {
         EMPTY = 0,
         SINGLE = 1, // single element
@@ -1876,7 +1911,9 @@ public:
     }
 
     BitmapValueIterator(const BitmapValueIterator& other)
-            : _bitmap(other._bitmap), _iter(other._iter), _sv(other._sv), _end(other._end) {}
+            : _bitmap(other._bitmap), _sv(other._sv), _end(other._end) {
+        _iter = other._iter ? new detail::Roaring64MapSetBitForwardIterator(*other._iter) : nullptr;
+    }
 
     ~BitmapValueIterator() {
         if (_iter != nullptr) {
@@ -1930,7 +1967,9 @@ public:
     }
 
     bool operator==(const BitmapValueIterator& other) const {
-        if (_end && other._end) return true;
+        if (_end && other._end) {
+            return true;
+        }
 
         switch (_bitmap._type) {
         case BitmapValue::BitmapDataType::EMPTY:
@@ -1947,6 +1986,27 @@ public:
 
     bool operator!=(const BitmapValueIterator& other) const { return !(*this == other); }
 
+    /**
+    * Move the iterator to the first value >= `val`.
+    */
+    BitmapValueIterator& move(uint64_t val) {
+        switch (_bitmap._type) {
+        case BitmapValue::BitmapDataType::SINGLE:
+            if (_sv < val) {
+                _end = true;
+            }
+            break;
+        case BitmapValue::BitmapDataType::BITMAP:
+            if (!_iter->move(val)) {
+                _end = true;
+            }
+            break;
+        default:
+            break;
+        }
+        return *this;
+    }
+
 private:
     const BitmapValue& _bitmap;
     detail::Roaring64MapSetBitForwardIterator* _iter = nullptr;
@@ -1962,4 +2022,16 @@ inline BitmapValueIterator BitmapValue::end() const {
     return BitmapValueIterator(*this, true);
 }
 
+inline BitmapValueIterator BitmapValue::lower_bound(uint64_t val) const {
+    return BitmapValueIterator(*this).move(val);
+}
+
+inline bool BitmapValue::contains_any(uint64_t left, uint64_t right) const {
+    if (left > right) {
+        return false;
+    }
+    auto it = lower_bound(left);
+    return it != end() && *it <= right;
+}
+
 } // namespace doris
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp
index 46cb029d38..ad978de41f 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.cpp
+++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp
@@ -66,6 +66,7 @@ Status NewOlapScanNode::_init_profile() {
     _block_init_timer = ADD_TIMER(_segment_profile, "BlockInitTime");
     _block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime");
     _block_init_seek_counter = ADD_COUNTER(_segment_profile, "BlockInitSeekCount", TUnit::UNIT);
+    _block_conditions_filtered_timer = ADD_TIMER(_segment_profile, "BlockConditionsFilteredTime");
 
     _rows_vec_cond_counter = ADD_COUNTER(_segment_profile, "RowsVectorPredFiltered", TUnit::UNIT);
     _vec_cond_timer = ADD_TIMER(_segment_profile, "VectorPredEvalTime");
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.h b/be/src/vec/exec/scan/new_olap_scan_node.h
index f55e715659..dc882aa6a9 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.h
+++ b/be/src/vec/exec/scan/new_olap_scan_node.h
@@ -95,6 +95,7 @@ private:
     RuntimeProfile::Counter* _block_init_timer = nullptr;
     RuntimeProfile::Counter* _block_init_seek_timer = nullptr;
     RuntimeProfile::Counter* _block_init_seek_counter = nullptr;
+    RuntimeProfile::Counter* _block_conditions_filtered_timer = nullptr;
     RuntimeProfile::Counter* _first_read_timer = nullptr;
     RuntimeProfile::Counter* _first_read_seek_timer = nullptr;
     RuntimeProfile::Counter* _first_read_seek_counter = nullptr;
diff --git a/be/src/vec/exec/scan/new_olap_scanner.cpp b/be/src/vec/exec/scan/new_olap_scanner.cpp
index 3ddccdf460..224affa4bc 100644
--- a/be/src/vec/exec/scan/new_olap_scanner.cpp
+++ b/be/src/vec/exec/scan/new_olap_scanner.cpp
@@ -386,6 +386,8 @@ void NewOlapScanner::_update_counters_before_close() {
     COUNTER_UPDATE(olap_parent->_block_init_timer, stats.block_init_ns);
     COUNTER_UPDATE(olap_parent->_block_init_seek_timer, stats.block_init_seek_ns);
     COUNTER_UPDATE(olap_parent->_block_init_seek_counter, stats.block_init_seek_num);
+    COUNTER_UPDATE(olap_parent->_block_conditions_filtered_timer,
+                   stats.block_conditions_filtered_ns);
     COUNTER_UPDATE(olap_parent->_first_read_timer, stats.first_read_ns);
     COUNTER_UPDATE(olap_parent->_first_read_seek_timer, stats.block_first_read_seek_ns);
     COUNTER_UPDATE(olap_parent->_first_read_seek_counter, stats.block_first_read_seek_num);
diff --git a/be/src/vec/runtime/shared_hash_table_controller.h b/be/src/vec/runtime/shared_hash_table_controller.h
index 0a308f8aea..e2c54f533d 100644
--- a/be/src/vec/runtime/shared_hash_table_controller.h
+++ b/be/src/vec/runtime/shared_hash_table_controller.h
@@ -42,7 +42,7 @@ struct SharedRuntimeFilterContext {
     std::shared_ptr<MinMaxFuncBase> minmax_func;
     std::shared_ptr<HybridSetBase> hybrid_set;
     std::shared_ptr<BloomFilterFuncBase> bloom_filter_func;
-    std::shared_ptr<BitmapFilterFuncBase> _bitmap_filter_func;
+    std::shared_ptr<BitmapFilterFuncBase> bitmap_filter_func;
 };
 
 struct SharedHashTableContext {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org