You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2024/04/30 00:20:50 UTC
(doris) branch master updated: [improvement](join) Avoid merging blocks more than once on the build side (#34291)
This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 619f83a677d [improvement](join) Avoid merging blocks more than once on the build side (#34291)
619f83a677d is described below
commit 619f83a677d2c4b714a4a2cd83ffe5e0d4c5bb66
Author: Jerry Hu <mr...@gmail.com>
AuthorDate: Tue Apr 30 08:20:43 2024 +0800
[improvement](join) Avoid merging blocks more than once on the build side (#34291)
---
be/src/pipeline/exec/hashjoin_build_sink.cpp | 32 ++++++++++++++++++++--------
be/src/pipeline/exec/hashjoin_build_sink.h | 4 ++++
2 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp
index 3bec2b07f0b..4331fd8efcb 100644
--- a/be/src/pipeline/exec/hashjoin_build_sink.cpp
+++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp
@@ -535,23 +535,37 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block*
RETURN_IF_ERROR(local_state._do_evaluate(*in_block, local_state._build_expr_ctxs,
*local_state._build_expr_call_timer,
res_col_ids));
-
- SCOPED_TIMER(local_state._build_side_merge_block_timer);
- RETURN_IF_ERROR(local_state._build_side_mutable_block.merge_ignore_overflow(*in_block));
- COUNTER_UPDATE(local_state._build_blocks_memory_usage, in_block->bytes());
- local_state._mem_tracker->consume(in_block->bytes());
- if (local_state._build_side_mutable_block.rows() >
- std::numeric_limits<uint32_t>::max()) {
+ local_state._build_side_rows += in_block->rows();
+ if (local_state._build_side_rows > std::numeric_limits<uint32_t>::max()) {
return Status::NotSupported(
- "Hash join do not support build table rows"
- " over:" +
+ "Hash join do not support build table rows over: {}, you should enable "
+ "join spill to avoid this issue",
std::to_string(std::numeric_limits<uint32_t>::max()));
}
+
+ local_state._mem_tracker->consume(in_block->bytes());
+ COUNTER_UPDATE(local_state._build_blocks_memory_usage, in_block->bytes());
+ local_state._build_blocks.emplace_back(std::move(*in_block));
}
}
if (local_state._should_build_hash_table && eos) {
DCHECK(!local_state._build_side_mutable_block.empty());
+
+ for (auto& column : local_state._build_side_mutable_block.mutable_columns()) {
+ column->reserve(local_state._build_side_rows);
+ }
+
+ {
+ SCOPED_TIMER(local_state._build_side_merge_block_timer);
+ for (auto& block : local_state._build_blocks) {
+ RETURN_IF_ERROR(local_state._build_side_mutable_block.merge_ignore_overflow(block));
+
+ vectorized::Block temp;
+ std::swap(block, temp);
+ }
+ }
+
local_state._shared_state->build_block = std::make_shared<vectorized::Block>(
local_state._build_side_mutable_block.to_block());
diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h
index f675ab46209..0d40d3d49f0 100644
--- a/be/src/pipeline/exec/hashjoin_build_sink.h
+++ b/be/src/pipeline/exec/hashjoin_build_sink.h
@@ -77,6 +77,10 @@ protected:
bool _should_build_hash_table = true;
int64_t _build_side_mem_used = 0;
int64_t _build_side_last_mem_used = 0;
+
+ size_t _build_side_rows = 0;
+ std::vector<vectorized::Block> _build_blocks;
+
vectorized::MutableBlock _build_side_mutable_block;
std::shared_ptr<VRuntimeFilterSlots> _runtime_filter_slots;
bool _has_set_need_null_map_for_build = false;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org