You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/05/24 03:28:16 UTC

[incubator-doris] 03/04: [vec][opt] opt hash join build resize hash table before insert data (#9735)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch dev-1.0.1
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git

commit 2265374d28c2411079e1b3a844b86e5859e41de1
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Mon May 23 15:13:57 2022 +0800

    [vec][opt] opt hash join build resize hash table before insert data (#9735)
    
    Co-authored-by: lihaopeng <li...@baidu.com>
---
 be/src/vec/common/hash_table/hash_table.h |  6 ++++++
 be/src/vec/exec/join/vhash_join_node.cpp  | 10 +++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h
index f0a94b77f3..c55d806699 100644
--- a/be/src/vec/common/hash_table/hash_table.h
+++ b/be/src/vec/common/hash_table/hash_table.h
@@ -731,6 +731,12 @@ protected:
     }
 
 public:
+    void expanse_for_add_elem(size_t num_elem) { // grow the table up front for num_elem pending additions
+        if (add_elem_size_overflow(num_elem)) { // resize only if the overflow check says num_elem more would not fit
+            resize(grower.buf_size() + num_elem); // NOTE(review): passes buf_size() + num_elem; confirm resize() expects an element count
+        }
+    }
+
     /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
     std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type& x) {
         std::pair<LookupResult, bool> res;
diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp
index 7ec9f23439..8a8691a51d 100644
--- a/be/src/vec/exec/join/vhash_join_node.cpp
+++ b/be/src/vec/exec/join/vhash_join_node.cpp
@@ -67,6 +67,10 @@ struct ProcessHashTableBuild {
         KeyGetter key_getter(_build_raw_ptrs, _join_node->_build_key_sz, nullptr);
 
         SCOPED_TIMER(_join_node->_build_table_insert_timer);
+        // Only when not build_unique do we need to expand the hash table before inserting data
+        if constexpr (!build_unique) {
+            hash_table_ctx.hash_table.expanse_for_add_elem(_rows);
+        }
         hash_table_ctx.hash_table.reset_resize_timer();
 
         vector<int>& inserted_rows = _join_node->_inserted_rows[&_acquired_block];
@@ -980,8 +984,8 @@ Status HashJoinNode::_hash_table_build(RuntimeState* state) {
         if (block.rows() != 0) { mutable_block.merge(block); }
 
         // make one block for each 4 gigabytes
-        constexpr static auto BUILD_BLOCK_MAX_SIZE =  4 * 1024UL * 1024UL * 1024UL;
-        if (_mem_used - last_mem_used > BUILD_BLOCK_MAX_SIZE) {
+        constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL;
+        if (UNLIKELY(_mem_used - last_mem_used > BUILD_BLOCK_MAX_SIZE)) {
             _build_blocks.emplace_back(mutable_block.to_block());
             // TODO:: Rethink may we should do the proess after we recevie all build blocks ?
             // which is better.
@@ -1099,7 +1103,7 @@ Status HashJoinNode::extract_probe_join_column(Block& block, NullMap& null_map,
 Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uint8_t offset) {
     SCOPED_TIMER(_build_table_timer);
     size_t rows = block.rows();
-    if (rows == 0) {
+    if (UNLIKELY(rows == 0)) {
         return Status::OK();
     }
     COUNTER_UPDATE(_build_rows_counter, rows);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org