You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by da...@apache.org on 2022/10/21 13:50:04 UTC
[doris] branch branch-1.2-unstable updated: [Revert](mem) revert the mem config cause perfermace degradation (#13526)
This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-1.2-unstable
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.2-unstable by this push:
new dcc8d57f46 [Revert](mem) revert the mem config cause perfermace degradation (#13526)
dcc8d57f46 is described below
commit dcc8d57f460d052b85e0ce9514abdaf6f715fffa
Author: HappenLee <ha...@hotmail.com>
AuthorDate: Fri Oct 21 08:32:16 2022 +0800
[Revert](mem) revert the mem config cause perfermace degradation (#13526)
* Revert "[fix](mem) failure of allocating memory (#13414)"
This reverts commit 971eb9172f3e925c0b46ec1ffd1a9037a1b49801.
* Revert "[improvement](memory) disable page cache and chunk allocator, optimize memory allocate size (#13285)"
This reverts commit a5f3880649b094b58061f25c15dccdb50a4a2973.
Conflicts:
be/src/common/config.h
---
be/src/common/config.h | 14 +++-----
be/src/runtime/exec_env_init.cpp | 15 ++++++--
be/src/runtime/mem_pool.cpp | 15 ++++++--
be/src/runtime/mem_pool.h | 13 ++++---
be/src/runtime/memory/chunk_allocator.cpp | 57 +++++++++++++++----------------
be/src/util/bit_util.h | 10 ++----
be/src/vec/common/arena.h | 11 ++----
be/src/vec/common/pod_array.h | 20 +++--------
8 files changed, 73 insertions(+), 82 deletions(-)
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 02f76870b0..a242e03f00 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -239,7 +239,7 @@ CONF_Int32(storage_page_cache_shard_size, "16");
// all storage page cache will be divided into data_page_cache and index_page_cache
CONF_Int32(index_page_cache_percentage, "10");
// whether to disable page cache feature in storage
-CONF_Bool(disable_storage_page_cache, "true");
+CONF_Bool(disable_storage_page_cache, "false");
CONF_Bool(enable_storage_vectorization, "true");
@@ -439,20 +439,14 @@ CONF_Bool(disable_mem_pools, "false");
// increase this variable can improve performance,
// but will acquire more free memory which can not be used by other modules.
CONF_mString(chunk_reserved_bytes_limit, "10%");
-
-// Whether using chunk allocator to cache memory chunk
-CONF_Bool(disable_chunk_allocator, "true");
+// 1024, The minimum chunk allocator size (in bytes)
+CONF_Int32(min_chunk_reserved_bytes, "1024");
// Disable Chunk Allocator in Vectorized Allocator, this will reduce memory cache.
// For high concurrent queries, using Chunk Allocator with vectorized Allocator can reduce the impact
// of gperftools tcmalloc central lock.
// Jemalloc or google tcmalloc have core cache, Chunk Allocator may no longer be needed after replacing
// gperftools tcmalloc.
-CONF_mBool(disable_chunk_allocator_in_vec, "true");
-
-// Both MemPool and vectorized engine's podarray allocator, vectorized engine's arena will try to allocate memory as power of two.
-// But if the memory is very large then power of two is also very large. This config means if the allocated memory's size is larger
-// than this limit then all allocators will not use RoundUpToPowerOfTwo to allocate memory.
-CONF_mInt64(memory_linear_growth_threshold, "134217728"); // 128Mb
+CONF_mBool(disable_chunk_allocator_in_vec, "false");
// The probing algorithm of partitioned hash table.
// Enable quadratic probing hash table
diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp
index e18ed5c9c1..bf0f66d344 100644
--- a/be/src/runtime/exec_env_init.cpp
+++ b/be/src/runtime/exec_env_init.cpp
@@ -193,7 +193,9 @@ Status ExecEnv::_init_mem_tracker() {
if (global_memory_limit_bytes > MemInfo::physical_mem()) {
LOG(WARNING) << "Memory limit "
<< PrettyPrinter::print(global_memory_limit_bytes, TUnit::BYTES)
- << " exceeds physical memory, using physical memory instead";
+ << " exceeds physical memory of "
+ << PrettyPrinter::print(MemInfo::physical_mem(), TUnit::BYTES)
+ << ". Using physical memory instead";
global_memory_limit_bytes = MemInfo::physical_mem();
}
_process_mem_tracker =
@@ -300,6 +302,13 @@ Status ExecEnv::_init_mem_tracker() {
RETURN_IF_ERROR(_disk_io_mgr->init(global_memory_limit_bytes));
RETURN_IF_ERROR(_tmp_file_mgr->init());
+ // 5. init chunk allocator
+ if (!BitUtil::IsPowerOf2(config::min_chunk_reserved_bytes)) {
+ ss << "Config min_chunk_reserved_bytes must be a power-of-two: "
+ << config::min_chunk_reserved_bytes;
+ return Status::InternalError(ss.str());
+ }
+
int64_t chunk_reserved_bytes_limit =
ParseUtil::parse_mem_spec(config::chunk_reserved_bytes_limit, global_memory_limit_bytes,
MemInfo::physical_mem(), &is_percent);
@@ -309,8 +318,8 @@ Status ExecEnv::_init_mem_tracker() {
<< config::chunk_reserved_bytes_limit;
return Status::InternalError(ss.str());
}
- // Has to round to multiple of page size(4096 bytes), chunk allocator will also check this
- chunk_reserved_bytes_limit = BitUtil::RoundDown(chunk_reserved_bytes_limit, 4096);
+ chunk_reserved_bytes_limit =
+ BitUtil::RoundDown(chunk_reserved_bytes_limit, config::min_chunk_reserved_bytes);
ChunkAllocator::init_instance(chunk_reserved_bytes_limit);
LOG(INFO) << "Chunk allocator memory limit: "
<< PrettyPrinter::print(chunk_reserved_bytes_limit, TUnit::BYTES)
diff --git a/be/src/runtime/mem_pool.cpp b/be/src/runtime/mem_pool.cpp
index 7e80e7e5b4..c2b709162c 100644
--- a/be/src/runtime/mem_pool.cpp
+++ b/be/src/runtime/mem_pool.cpp
@@ -119,9 +119,20 @@ Status MemPool::find_chunk(size_t min_size, bool check_limits) {
}
// Didn't find a big enough free chunk - need to allocate new chunk.
+ size_t chunk_size = 0;
DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE);
- DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE);
- size_t chunk_size = BitUtil::RoundUpToPowerOfTwo(std::max<size_t>(min_size, next_chunk_size_));
+
+ if (config::disable_mem_pools) {
+ // Disable pooling by sizing the chunk to fit only this allocation.
+ // Make sure the alignment guarantees are respected.
+ // This will generate too many small chunks.
+ chunk_size = std::max<size_t>(min_size, alignof(max_align_t));
+ } else {
+ DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE);
+ chunk_size = std::max<size_t>(min_size, next_chunk_size_);
+ }
+
+ chunk_size = BitUtil::RoundUpToPowerOfTwo(chunk_size);
if (check_limits &&
!thread_context()->_thread_mem_tracker_mgr->limiter_mem_tracker_raw()->check_limit(
chunk_size)) {
diff --git a/be/src/runtime/mem_pool.h b/be/src/runtime/mem_pool.h
index d11952dd50..41240ab375 100644
--- a/be/src/runtime/mem_pool.h
+++ b/be/src/runtime/mem_pool.h
@@ -232,16 +232,15 @@ private:
ChunkInfo& info = chunks_[current_chunk_idx_];
int64_t aligned_allocated_bytes =
- BitUtil::RoundUpToMultiplyOfFactor(info.allocated_bytes, alignment);
- auto size_with_padding = size + DEFAULT_PADDING_SIZE;
- if (aligned_allocated_bytes + size_with_padding <= info.chunk.size) {
+ BitUtil::RoundUpToPowerOf2(info.allocated_bytes + DEFAULT_PADDING_SIZE, alignment);
+ if (aligned_allocated_bytes + size <= info.chunk.size) {
// Ensure the requested alignment is respected.
int64_t padding = aligned_allocated_bytes - info.allocated_bytes;
uint8_t* result = info.chunk.data + aligned_allocated_bytes;
- ASAN_UNPOISON_MEMORY_REGION(result, size_with_padding);
- DCHECK_LE(info.allocated_bytes + size_with_padding, info.chunk.size);
- info.allocated_bytes += padding + size_with_padding;
- total_allocated_bytes_ += padding + size_with_padding;
+ ASAN_UNPOISON_MEMORY_REGION(result, size);
+ DCHECK_LE(info.allocated_bytes + size, info.chunk.size);
+ info.allocated_bytes += padding + size;
+ total_allocated_bytes_ += padding + size;
DCHECK_LE(current_chunk_idx_, chunks_.size() - 1);
return result;
}
diff --git a/be/src/runtime/memory/chunk_allocator.cpp b/be/src/runtime/memory/chunk_allocator.cpp
index 09df7e23d4..43acc79538 100644
--- a/be/src/runtime/memory/chunk_allocator.cpp
+++ b/be/src/runtime/memory/chunk_allocator.cpp
@@ -154,36 +154,35 @@ ChunkAllocator::ChunkAllocator(size_t reserve_limit)
Status ChunkAllocator::allocate(size_t size, Chunk* chunk) {
CHECK((size > 0 && (size & (size - 1)) == 0));
+ // fast path: allocate from current core arena
int core_id = CpuInfo::get_current_core();
- chunk->core_id = core_id;
chunk->size = size;
- if (!config::disable_chunk_allocator) {
- // fast path: allocate from current core arena
- if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) {
- DCHECK_GE(_reserved_bytes, 0);
- _reserved_bytes.fetch_sub(size);
- chunk_pool_local_core_alloc_count->increment(1);
- // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
- THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
- return Status::OK();
- }
- // Second path: try to allocate from other core's arena
- // When the reserved bytes is greater than the limit, the chunk is stolen from other arena.
- // Otherwise, it is allocated from the system first, which can reserve enough memory as soon as possible.
- // After that, allocate from current core arena as much as possible.
- if (_reserved_bytes > _steal_arena_limit) {
- ++core_id;
- for (int i = 1; i < _arenas.size(); ++i, ++core_id) {
- if (_arenas[core_id % _arenas.size()]->pop_free_chunk(size, &chunk->data)) {
- DCHECK_GE(_reserved_bytes, 0);
- _reserved_bytes.fetch_sub(size);
- chunk_pool_other_core_alloc_count->increment(1);
- // reset chunk's core_id to other
- chunk->core_id = core_id % _arenas.size();
- // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
- THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
- return Status::OK();
- }
+ chunk->core_id = core_id;
+
+ if (_arenas[core_id]->pop_free_chunk(size, &chunk->data)) {
+ DCHECK_GE(_reserved_bytes, 0);
+ _reserved_bytes.fetch_sub(size);
+ chunk_pool_local_core_alloc_count->increment(1);
+ // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
+ THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
+ return Status::OK();
+ }
+ // Second path: try to allocate from other core's arena
+ // When the reserved bytes is greater than the limit, the chunk is stolen from other arena.
+ // Otherwise, it is allocated from the system first, which can reserve enough memory as soon as possible.
+ // After that, allocate from current core arena as much as possible.
+ if (_reserved_bytes > _steal_arena_limit) {
+ ++core_id;
+ for (int i = 1; i < _arenas.size(); ++i, ++core_id) {
+ if (_arenas[core_id % _arenas.size()]->pop_free_chunk(size, &chunk->data)) {
+ DCHECK_GE(_reserved_bytes, 0);
+ _reserved_bytes.fetch_sub(size);
+ chunk_pool_other_core_alloc_count->increment(1);
+ // reset chunk's core_id to other
+ chunk->core_id = core_id % _arenas.size();
+ // transfer the memory ownership of allocate from ChunkAllocator::tracker to the tls tracker.
+ THREAD_MEM_TRACKER_TRANSFER_FROM(size, _mem_tracker.get());
+ return Status::OK();
}
}
}
@@ -205,7 +204,7 @@ Status ChunkAllocator::allocate(size_t size, Chunk* chunk) {
void ChunkAllocator::free(const Chunk& chunk) {
DCHECK(chunk.core_id != -1);
CHECK((chunk.size & (chunk.size - 1)) == 0);
- if (config::disable_chunk_allocator) {
+ if (config::disable_mem_pools) {
SystemAllocator::free(chunk.data, chunk.size);
return;
}
diff --git a/be/src/util/bit_util.h b/be/src/util/bit_util.h
index f68586df64..28534b139b 100644
--- a/be/src/util/bit_util.h
+++ b/be/src/util/bit_util.h
@@ -43,8 +43,6 @@ public:
return value / divisor + (value % divisor != 0);
}
- static inline size_t round_up_to_page_size(size_t s) { return (s + 4096 - 1) / 4096 * 4096; }
-
// Returns 'value' rounded up to the nearest multiple of 'factor'
static inline int64_t round_up(int64_t value, int64_t factor) {
return (value + (factor - 1)) / factor * factor;
@@ -306,12 +304,8 @@ public:
}
/// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
- /// a power of two, for example
- /// Factor has to be a power of two
- /// factor = 16, value = 10 --> result = 16
- /// factor = 16, value = 17 --> result = 32
- /// factor = 16, value = 33 --> result = 48
- static inline int64_t RoundUpToMultiplyOfFactor(int64_t value, int64_t factor) {
+ /// a power of two
+ static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
return (value + (factor - 1)) & ~(factor - 1);
}
diff --git a/be/src/vec/common/arena.h b/be/src/vec/common/arena.h
index 8042d5618d..e136bae143 100644
--- a/be/src/vec/common/arena.h
+++ b/be/src/vec/common/arena.h
@@ -127,16 +127,11 @@ private:
public:
Arena(size_t initial_size_ = 4096, size_t growth_factor_ = 2,
- size_t linear_growth_threshold_ = -1)
+ size_t linear_growth_threshold_ = 128 * 1024 * 1024)
: growth_factor(growth_factor_),
+ linear_growth_threshold(linear_growth_threshold_),
head(new Chunk(initial_size_, nullptr)),
- size_in_bytes(head->size()) {
- if (linear_growth_threshold_ < 0) {
- linear_growth_threshold = config::memory_linear_growth_threshold;
- } else {
- linear_growth_threshold = linear_growth_threshold_;
- }
- }
+ size_in_bytes(head->size()) {}
~Arena() { delete head; }
diff --git a/be/src/vec/common/pod_array.h b/be/src/vec/common/pod_array.h
index 6d844340ef..10a8ed8d75 100644
--- a/be/src/vec/common/pod_array.h
+++ b/be/src/vec/common/pod_array.h
@@ -30,8 +30,6 @@
#include <cstddef>
#include <memory>
-#include "common/config.h"
-#include "util/bit_util.h"
#include "vec/common/allocator.h"
#include "vec/common/bit_helpers.h"
#include "vec/common/memcpy_small.h"
@@ -122,9 +120,8 @@ protected:
}
}
- /// Not round up, keep the size just as the application pass in like std::vector
void alloc_for_num_elements(size_t num_elements) {
- alloc(minimum_memory_for_elements(num_elements));
+ alloc(round_up_to_power_of_two_or_zero(minimum_memory_for_elements(num_elements)));
}
template <typename... TAllocatorParams>
@@ -184,7 +181,6 @@ protected:
return (stack_threshold > 0) && (allocated_bytes() <= stack_threshold);
}
- /// This method is called by push back or emplace back, this is the same behaviour with std::vector
template <typename... TAllocatorParams>
void reserve_for_next_size(TAllocatorParams&&... allocator_params) {
if (size() == 0) {
@@ -193,10 +189,8 @@ protected:
realloc(std::max(integerRoundUp(initial_bytes, ELEMENT_SIZE),
minimum_memory_for_elements(1)),
std::forward<TAllocatorParams>(allocator_params)...);
- } else {
- // There is still a power of 2 expansion here, this method is used in push back method
+ } else
realloc(allocated_bytes() * 2, std::forward<TAllocatorParams>(allocator_params)...);
- }
}
#ifndef NDEBUG
@@ -450,11 +444,9 @@ public:
template <typename It1, typename It2, typename... TAllocatorParams>
void insert_prepare(It1 from_begin, It2 from_end, TAllocatorParams&&... allocator_params) {
size_t required_capacity = this->size() + (from_end - from_begin);
- if (required_capacity > this->capacity()) {
- // std::vector's insert method will expand if required capactiy is larger than current
+ if (required_capacity > this->capacity())
this->reserve(round_up_to_power_of_two_or_zero(required_capacity),
std::forward<TAllocatorParams>(allocator_params)...);
- }
}
/// Do not insert into the array a piece of itself. Because with the resize, the iterators on themselves can be invalidated.
@@ -631,10 +623,8 @@ public:
template <typename It1, typename It2>
void assign(It1 from_begin, It2 from_end) {
size_t required_capacity = from_end - from_begin;
- if (required_capacity > this->capacity()) {
- // std::vector assign just expand the capacity to the required capacity
- this->reserve(required_capacity);
- }
+ if (required_capacity > this->capacity())
+ this->reserve(round_up_to_power_of_two_or_zero(required_capacity));
size_t bytes_to_copy = this->byte_size(required_capacity);
memcpy(this->c_start, reinterpret_cast<const void*>(&*from_begin), bytes_to_copy);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org